In [0]:
# Imports here
!pip install torch
!pip install torchvision
import torch,torchvision
#!pip install -I pillow
import numpy as np
import torchvision.transforms as transforms
from torch.autograd import Variable

#!pip install Pillow==4.0.0
#!pip install PIL
#!pip install image
#import PIL
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
import re
from nltk.corpus import stopwords
import pickle
import nltk
from collections import defaultdict
import copy
from collections import Counter
from tqdm import tqdm as tqdm
import pandas as pd
import numpy as np
from torch.optim.lr_scheduler import CosineAnnealingLR
data_dir = '/drive/My Drive/Study/fakenews/'
isPreprocess = True
from google.colab import drive
drive.mount('/drive/')


Collecting torch
  Downloading https://files.pythonhosted.org/packages/7e/60/66415660aa46b23b5e1b72bc762e816736ce8d7260213e22365af51e8f9c/torch-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (591.8MB)
    100% |████████████████████████████████| 591.8MB 27kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x62192000 @  0x7f3f26ee12a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
Installing collected packages: torch
Successfully installed torch-1.0.0
Collecting torchvision
  Downloading https://files.pythonhosted.org/packages/ca/0d/f00b2885711e08bd71242ebe7b96561e6f6d01fdb4b9dcf4d37e2e13c5e1/torchvision-0.2.1-py2.py3-none-any.whl (54kB)
    100% |████████████████████████████████| 61kB 4.0MB/s 
Collecting pillow>=4.1.1 (from torchvision)
  Downloading https://files.pythonhosted.org/packages/62/94/5430ebaa83f91cc7a9f687ff5238e26164a779cca2ef9903232268b0a318/Pillow-5.3.0-cp36-cp36m-manylinux1_x86_64.whl (2.0MB)
    100% |████████████████████████████████| 2.0MB 12.4MB/s 
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.11.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.14.6)
Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchvision) (1.0.0)
Installing collected packages: pillow, torchvision
  Found existing installation: Pillow 4.0.0
    Uninstalling Pillow-4.0.0:
      Successfully uninstalled Pillow-4.0.0
Successfully installed pillow-5.3.0 torchvision-0.2.1
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /drive/

In [0]:


In [0]:
import os
print(os.listdir("."))
print(os.listdir("/drive/My Drive"))
#os.chdir("drive/Colab/")
#print(os.listdir("."))


['.config', 'sample_data']
['IMG_1806.MOV', 'Teradata - работа', 'поиск вакансий', 'Проекты по кодингу', 'Дипломы, эссе, презентации', 'Study', 'Google Фото', 'python', 'Манифест неоэкзистенциализма.gdoc', 'финансы.xlsx.gsheet', 'финансы 23.02.2018.xlsx', 'Colab Notebooks', 'vpn.zip', 'Kaggle: homecredit.gsheet', 'temp', 'TGS_SALT-master.zip', 'квартиры.gsheet', '2kzn.pdf', '2msk.pdf', 'финансы 23.02.2018.xlsx.gsheet', 'Мебель.gsheet']

PREPROCESSING


In [0]:
'''
#nltk.download('stopwords')
stopwords_en = stopwords.words('english')
stopwords_en.remove("not")
stopwords_en.remove("no")
stopwords_en.remove("nor")
'''
stopwords_en = ['the', 'a' 'an']
def make_new_data(df):

    title1_en = list(df["title1_en"])
    title2_en = list(df["title2_en"])
    title1_zh = list(df["title1_zh"])
    title2_zh = list(df["title2_zh"])
    labels = list(df["label"])
    id1_train = list(df["tid1"])
    id2_train = list(df["tid2"])


    # id-text dictionary
    id_to_text_en = defaultdict(list)
    id_to_text_zh = defaultdict(list)
    for idx, id1 in enumerate(id1_train):
        #if not id1 in id_to_text_en.keys():
        id_to_text_en[id1] = title1_en[idx]
        id_to_text_zh[id1] = title1_zh[idx]

    for idx, id2 in enumerate(id2_train):
        #if not id2 in id_to_text_en.keys():
        id_to_text_en[id2] = title2_en[idx]
        id_to_text_zh[id2] = title2_zh[idx]


    # key : id,
    # value : id of agreed text or diagreed text.
    agree_dic = defaultdict(list)
    disagree_dic = defaultdict(list)
    
    given_dic = defaultdict(list)
    bidirection_dic = defaultdict(list)

    fixed_dic = defaultdict(list)

    for idx, id1 in enumerate(id1_train):
        label = labels[idx]
        id2 = id2_train[idx]
        given_dic[id1].append((id2, label))

    # for idx, id1 in enumerate(id1_train):
    #     label = labels[idx]
    #     id2 = id2_train[idx]
    #     given_dic[id1].append((id2, label))
    
    for idx, id1 in enumerate(id1_train):
        label = labels[idx]
        id2 = id2_train[idx]

        if not len(fixed_dic[id1]) == 0:
            already_given_id = np.array(fixed_dic[id1])[:,0]
            already_given_label = np.array(fixed_dic[id1])[:,1]
            if not id2 in already_given_id:
                fixed_dic[id1].append([id2, label])
            else:
                id2_idx = list(already_given_id).index(id2)
                already_given = already_given_label[id2_idx]
                if not label == already_given:
                    #print(id1, id2, already_given, label)
                    if label == 0:
                        pass
                    elif label == 1 and already_given == 0:
                        true_label = 1                        
                        fixed_dic[id1][id2_idx][1] = true_label
                    elif label == 2 or already_given == 2:
                        true_label = 2
                        fixed_dic[id1][id2_idx][1] = true_label
                    #print(id1, given_dic[id1][id2_idx])
                else:
                    pass

        else:
            #最初に登録するとき
            fixed_dic[id1].append([id2, label])

        if not len(fixed_dic[id2]) == 0:
            already_given_id = np.array(fixed_dic[id2])[:,0]
            already_given_label = np.array(fixed_dic[id2])[:,1]
            if not id1 in already_given_id:
                fixed_dic[id2].append([id1, label])
            else:
                id1_idx = list(already_given_id).index(id1)
                already_given = already_given_label[id1_idx]
                if not label == already_given:
                    if label == 0:
                        pass
                    elif label == 1 and already_given == 0:
                        true_label = 1
                        fixed_dic[id2][id1_idx][1] = true_label
                    elif label == 2 or already_given == 2:
                        true_label = 2
                        fixed_dic[id2][id1_idx][1] = true_label


        else:
            fixed_dic[id2].append([id1, label])
    print("agree dic:{}, disagree dic:{}".format(len(agree_dic), len(disagree_dic)))

    fixed_dic_cleaned = copy.deepcopy(fixed_dic)
    print("deleting dublicates")
    for id_, id_label_list in tqdm(fixed_dic_cleaned.items()):
        #print(id_label_list)
        if len(id_label_list) == 0:
            continue
        id_list = np.array(id_label_list)[:,0]
        for eachid in id_list:
            id_label_list2 = fixed_dic_cleaned[eachid]
            if len(id_label_list2) == 0:
                continue

            id_list2 = list(np.array(id_label_list2)[:,0])
            if id_ in id_list2:
                idx = list(id_list2).index(id_)
                id_label_list2.pop(idx)

    for id1, id_label_list in fixed_dic.items():
        if len(id_label_list) == 0:
            continue
        id_list = np.array(id_label_list)[:,0]
        label_list = np.array(id_label_list)[:,1]
        for id2, label in zip(id_list, label_list):

            if label == 1:
                agree_dic[id1].append(id2)
            elif label == 2:
                disagree_dic[id1].append(id2)

    new_data = []
    given_label_agree = []
    given_label_dis = []


    for id1, id_label_list in fixed_dic_cleaned.items():
        if len(id_label_list) == 0:
            continue
        id2_list = np.array(id_label_list)[:,0]
        label_list = np.array(id_label_list)[:,1]
        for id2, label in zip(id2_list, label_list):
            new_data.append((id_to_text_en[id1], id_to_text_en[id2], id_to_text_zh[id1], id_to_text_zh[id2], label))


    print("fixed data length:{}, original:{}".format(len(new_data), len(id1_train)))


    forecast_dic = defaultdict(list)

    for id_, agree_ids in agree_dic.items():
        disagree_ids = disagree_dic[id_]

        for agree_id in agree_ids:
            given_ids_labels= fixed_dic[agree_id]
            if len(given_ids_labels) == 0:
                continue
            given_ids = np.array(given_ids_labels)[:, 0]
            given_labels = np.array(given_ids_labels)[:, 1]
            assert given_ids.shape == given_labels.shape

            # new 'disagree data'
            for disagree_id in disagree_ids:
                if disagree_id in given_ids:
                    # When labels are already given
                    idx = list(given_ids).index(disagree_id)
                    label = given_labels[idx]
                    given_label_dis.append(label)
                    pass
                else:
                    # hen the label is not given explicitly
                    forecast_dic[agree_id].append((disagree_id, 2))
                    forecast_dic[disagree_id].append((agree_id, 2))
                    new_data.append((id_to_text_en[agree_id], id_to_text_en[disagree_id], id_to_text_zh[agree_id], id_to_text_zh[disagree_id], 2))

            # new 'agree data'
            for agree_id2 in agree_ids:
                if agree_id == agree_id2:
                    continue
                else:
                    if agree_id2 in given_ids:
                        # when labels are already given
                        idx = list(given_ids).index(agree_id2)
                        label = given_labels[idx]
                        given_label_agree.append(label)
                        pass
                    else:
                        pass
                        # when the label is not given explicitly.
                        forecast_dic[agree_id].append((agree_id2, 1))
                        forecast_dic[agree_id2].append((agree_id, 1))

                        # new_data.append((id_to_text_en[agree_id], id_to_text_en[agree_id2], id_to_text_zh[agree_id], id_to_text_zh[agree_id2], 1))

#     c = Counter(given_label_agree)
#     print("given_label_agree", c)
#     c = Counter(given_label_dis)
#     print("given_label_disagree", c)
    print("final data length:",len(new_data))
    with open(data_dir + 'save/fixed_dic.pickle', mode='wb') as f:
        pickle.dump(fixed_dic, f)
    with open(data_dir + 'save/given_dic.pickle', mode='wb') as f:
        pickle.dump(given_dic, f)
    with open(data_dir + 'save/forecast_dic.pickle', mode='wb') as f:
        pickle.dump(forecast_dic, f)

    return new_data, given_dic, fixed_dic, forecast_dic 
def preprocess_():

    train_df = pd.read_csv(data_dir + "train.csv")
    test_df = pd.read_csv(data_dir + "test.csv")
    sub = pd.read_csv(data_dir + "sample_submission.csv")

    def english_clean_series(series):
        # Uppercase letters ---> lowercase letters
        series = series.str.lower()

        def clean_seq(seq):
            seq = seq.replace("it's", "it is")
            seq = seq.replace("he's", "he is")
            seq = seq.replace("she's", "she is")
            seq = seq.replace("you're", "you are")
            seq = seq.replace("we're", "we are")
            seq = seq.replace("they're", "they are")
            seq = seq.replace("i'm", "i am")
            seq = seq.replace("don't", "do not")
            seq = seq.replace("does't", "does not")
            seq = seq.replace("didn't", "did not")
            seq = seq.replace("aren't", "are not")
            seq = seq.replace("weren't", "were not")
            seq = seq.replace("isn't", "is not")
            seq = seq.replace("wasn't", "was not")
            seq = seq.replace("haven't", "have not")
            seq = seq.replace("hasn't", "has not")
            seq = seq.replace("can't", "can not")
            seq = seq.replace("cannot", "can not")

            seq = seq.replace("shouldn't", "should not")
            seq = seq.replace("wouldn't", "would not")
            seq = seq.replace("couldn't", "could not")
            seq = seq.replace("mightn't", "might not")
            seq = seq.replace("mustn't", "must not")
            seq = seq.replace("needn't", "need not")
            seq = seq.replace("won't", "will not")

            seq = seq.replace("'s", "")
            seq = seq.replace("\n", "")
            seq = seq.replace("[", "")
            seq = seq.replace("]", "")
            seq = seq.replace(" the ", " ")
            seq = seq.replace(" a ", " ")
            seq = seq.replace(" an ", " ")


            seq = seq.replace("< i >", "")
            seq = seq.replace("< / i >", "")

            seq = re.sub(r'[,."''“”。、#()→⇒←↓↑:;_㊙️【《》=|/+<>]+', '', seq)
            seq = seq.replace(r'-', ' - ')
            seq = seq.replace(r'!', ' ! ')
            seq = seq.replace(r'?', ' ? ') 
            seq = seq.replace(r'?', ' ? ')
            seq = seq.replace(r'!', ' ! ')
            seq = seq.replace(r'?', ' ? ')
            seq = re.sub(r'[$]+', '$ ', seq)
            seq = re.sub(r'[0-9]+', '<NUM>', seq)

            seq_split = seq.split(" ")

            new_seq = ""
            for word in seq_split:
                if not word in stopwords_en:
                    new_seq += word
                    new_seq += " "
            return new_seq
        '''
            with open('save/top_words.pickle', mode='rb') as f:
                top_words = pickle.load(f)

            # Leave frequent top 20000 words. Do we need them???
            seq = new_seq
            seq_split = seq.split(" ")
            new_seq = ""
            for word in seq_split:
                if word in top_words:
                    new_seq += word
                    new_seq += " "

            return new_seq
        series = series.apply(clean_seq)
        '''
 
        return series.apply(clean_seq)

    def chinese_clean_series(series):
        def clean_seq(seq):
            seq = str(seq)
            seq = seq.replace("< i >", "")
            seq = seq.replace("< / i >", "")
            seq = seq.replace("\n", "")
            seq = re.sub(r'[,."''“”。、#()→⇒←↓↑:;_㊙️【《》=|/<>]+', '', seq)
            #seq = re.sub(r'[!!??-]+', ' ', seq)
            seq = seq.replace(r'-', ' - ')
            seq = seq.replace(r'!', ' ! ')
            seq = seq.replace(r'?', ' ? ') 
            seq = seq.replace(r'?', ' ? ')
            seq = seq.replace(r'!', ' ! ')
            seq = seq.replace(r'?', ' ? ')
            seq = re.sub(r'[$]+', '$ ', seq)
            seq = re.sub(r'万', '00', seq)            
            seq = re.sub(r'[0-9]+', '<NUM>', seq)

            return seq

        series = series.apply(clean_seq)
        return series

    train_df["title1_en"] = english_clean_series(train_df["title1_en"])
    train_df["title2_en"] = english_clean_series(train_df["title2_en"])
    train_df["title1_zh"] =  chinese_clean_series(train_df["title1_zh"])
    train_df["title2_zh"] =  chinese_clean_series(train_df["title2_zh"])

    test_df["title1_en"] = english_clean_series(test_df["title1_en"])
    test_df["title2_en"] = english_clean_series(test_df["title2_en"])
    test_df["title1_zh"] =  chinese_clean_series(test_df["title1_zh"])
    test_df["title2_zh"] =  chinese_clean_series(test_df["title2_zh"])

    train_df.replace('unrelated', 0, inplace=True)
    train_df.replace('agreed', 1, inplace=True)
    train_df.replace('disagreed', 2, inplace=True)

    y = list(train_df["label"])


    #Create a word dictionary

    train_t1_en = train_df["title1_en"]
    train_t2_en = train_df["title2_en"]

    test_t1_en = test_df["title1_en"]
    test_t2_en = test_df["title2_en"]

    train_t1_zh = train_df["title1_zh"]
    train_t2_zh = train_df["title2_zh"]
    test_t1_zh = test_df["title1_zh"]
    test_t2_zh = test_df["title2_zh"]

    label = train_df["label"]
    print(train_t1_en.head())
    word_to_ix_en = {}
    for title1, title2 in zip(tqdm(train_t1_en), train_t2_en):         
        for word in title1.split():
            if word not in word_to_ix_en.keys():
                word_to_ix_en[word] = len(word_to_ix_en)+1
        for word in title2.split():
            if word not in word_to_ix_en.keys():
                word_to_ix_en[word] = len(word_to_ix_en)+1

    for title1, title2 in zip(tqdm(test_t1_en), test_t2_en):
        for word in title1.split():
            if word not in word_to_ix_en.keys():
                word_to_ix_en[word] = len(word_to_ix_en)+1
        for word in title2.split():
            if word not in word_to_ix_en.keys():
                word_to_ix_en[word] = len(word_to_ix_en)+1


    #Chinese
    word_to_ix_zh = {}
    for title1, title2 in zip(tqdm(train_t1_zh), train_t2_zh):
        for word in title1:
            if word not in word_to_ix_zh.keys():
                word_to_ix_zh[word] = len(word_to_ix_zh)+1
        for word in title2:
            if word not in word_to_ix_zh.keys():
                word_to_ix_zh[word] = len(word_to_ix_zh)+1

    for title1, title2 in zip(tqdm(test_t1_zh), test_t2_zh):
        for word in title1:
            if word not in word_to_ix_zh.keys():
                word_to_ix_zh[word] = len(word_to_ix_zh)+1
        for word in title2:
            if word not in word_to_ix_zh.keys():
                word_to_ix_zh[word] = len(word_to_ix_zh)+1

    print("the number of english words:{}, chinese words:{}".format(len(word_to_ix_en), len(word_to_ix_zh)))

    with open(data_dir + 'save/word_to_ix_en.pickle', mode='wb') as f:
        pickle.dump(word_to_ix_en, f)
    with open(data_dir + 'save/word_to_ix_zh.pickle', mode='wb') as f:
        pickle.dump(word_to_ix_zh, f)
    with open(data_dir + 'save/train_df.pickle', mode='wb') as f:
        pickle.dump(train_df, f)
    with open(data_dir + 'save/test_df.pickle', mode='wb') as f:
        pickle.dump(test_df, f)

    print("cleaned df, word to ix saved.")


    # The agree articles to A may be in a disagreeous relationship with disagree articles in A?

    # with open('save/word_to_ix_en.pickle', mode='rb') as f:
    #      word_to_ix_en = pickle.load(f)
    # with open('save/word_to_ix_zh.pickle', mode='rb') as f:
    #      word_to_ix_zh = pickle.load(f)
    # with open('save/train_df.pickle', mode='rb') as f:
    #      train_df = pickle.load(f)
    # with open('save/test_df.pickle', mode='rb') as f:
    #      test_df = pickle.load(f)

    #
    # title1_en = list(train_df["title1_en"])
    # title2_en = list(train_df["title2_en"])
    # title1_zh = list(train_df["title1_zh"])
    # title2_zh = list(train_df["title2_zh"])
    # labels = list(train_df["label"])
    #
    # id1 = list(train_df["tid1"])
    # id2 = list(train_df["tid2"])
    #
    # #id1_train, id1_val, train1_en, val1_en, train1_zh, val1_zh, id2_train, id2_val, train2_en, val2_en,train2_zh, val2_zh, y_train, y_val = train_test_split(id1, title1_en, title1_zh, id2, title2_en, title2_zh, labels, test_size=0.2, random_state=0)
    # training_df, val_df = train_test_split(train_df, test_size=0.2, random_state=0)
    #
    #
    # #new_data, _ = make_new_data(id1_train, id2_train, train1_en, train2_en, y_train)
    # new_data, _, _ = make_new_data(training_df)
    #
    # #print(len(new_data_en))
    #
    # train1_en, train2_en = [],[]
    # train1_zh, train2_zh = [],[]
    # y_train = []
    # for text1_en, text2_en, text1_zh, text2_zh,label in new_data:
    #         train1_en.append(text1_en)
    #         train2_en.append(text2_en)
    #         train1_zh.append(text1_zh)
    #         train2_zh.append(text2_zh)
    #         y_train.append(label)
    #
    # # new_data_zh, _ = make_new_data(id1_train, id2_train, train1_zh, train2_zh, y_train)
    # # print(len(new_data_zh))
    # # for text1, text2, label in new_data_zh:
    # #         train1_zh.append(text1)
    # #         train2_zh.append(text2)
    # #
    #
    # val1_en, val2_en = list(val_df["title1_en"]), list(val_df["title2_en"])
    # val1_zh, val2_zh = list(val_df["title1_zh"]), list(val_df["title2_zh"])
    # y_val = list(val_df["label"])
    #
    # assert len(train1_zh)==len(train1_en)  and len(y_train)==len(train1_zh)
    #
    #
    #
    # print("training data:{}, validation data:{}".format(len(y_train), len(y_val)))

    return 0

    # return (train1_en, val1_en, train1_zh, val1_zh, train2_en, val2_en,train2_zh, val2_zh, y_train, y_val)
if isPreprocess==True:
    preprocess_()


  3%|▎         | 8647/320552 [00:00<00:03, 86467.81it/s]
0    there are two new old - age insurance benefits...
1    if you do not come to shenzhen sooner or later...
2    if you do not come to shenzhen sooner or later...
3    if you do not come to shenzhen sooner or later...
4    how to discriminate oil from gutter oil by mea...
Name: title1_en, dtype: object
100%|██████████| 320552/320552 [00:02<00:00, 110104.05it/s]
100%|██████████| 80126/80126 [00:00<00:00, 109490.65it/s]
100%|██████████| 320552/320552 [00:03<00:00, 84215.54it/s]
100%|██████████| 80126/80126 [00:00<00:00, 85789.30it/s]
the number of english words:42813, chinese words:5218
cleaned df, word to ix saved.

MODELS


In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

import re

from nltk.corpus import stopwords
import nltk


class LSTM_Classifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size_en, vocab_size_zh, target_size=3, seq_length_en=50, seq_length_zh=140):
        super(LSTM_Classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.word_embeddings_en = nn.Embedding(vocab_size_en+1, embedding_dim, padding_idx=0)
        self.word_embeddings_zh = nn.Embedding(vocab_size_zh+1, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm_en = nn.LSTM(embedding_dim, hidden_dim, batch_first=False, num_layers=2)
        self.lstm_zh = nn.LSTM(embedding_dim, hidden_dim, batch_first=False, num_layers=2)

        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim*2)
        self.fc1_drop = nn.Dropout(p=0.5, inplace=False)

        self.fc2 = nn.Linear(hidden_dim*2, target_size)
        self.initial_hidden = self.init_hidden()


        self.seq_length_en=seq_length_en
        self.seq_length_zh=seq_length_zh

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, title1_en, title2_en, title1_zh, title2_zh):
        batch = title1_en.shape[0]

        embeds1_en = self.word_embeddings_en(title1_en)
        embeds2_en = self.word_embeddings_en(title2_en)

        embeds1_zh = self.word_embeddings_zh(title1_zh)
        embeds2_zh = self.word_embeddings_zh(title2_zh)

        # seq_length * batch * feature_dims
        embeds1_en = embeds1_en.view(self.seq_length_en, batch, self.embedding_dim)
        embeds2_en = embeds2_en.view(self.seq_length_en, batch, self.embedding_dim)

        embeds1_zh = embeds1_zh.view(self.seq_length_zh, batch, self.embedding_dim)
        embeds2_zh = embeds2_zh.view(self.seq_length_zh, batch, self.embedding_dim)

        #print("embeds1_en", embeds1_en.size())

        lstm_out1_en, self.hidden = self.lstm_en(embeds1_en)#, self.initial_hidden)
        lstm_out2_en, self.hidden = self.lstm_en(embeds2_en)
        lstm_out1_zh, self.hidden = self.lstm_zh(embeds1_zh)
        lstm_out2_zh, self.hidden = self.lstm_zh(embeds1_zh)

        en_sum = lstm_out1_en[-1] + lstm_out2_en[-1]
        zh_sum = lstm_out1_zh[-1] + lstm_out2_zh[-1]
        #print("embedding size:",en_sum.size(), zh_sum.size())

        concat = torch.cat((en_sum, zh_sum), dim=1)
        #print("lstm out:", lstm_out1[-1].size())
        #print("concat:", concat.size())

        fc1 = self.fc1_drop(F.relu(self.fc1(concat)))
        fc2 = self.fc2(fc1)

        return fc2



class MLP_Classifier(nn.Module):

    def __init__(self, embedding_dim, vocab_size, target_size=3, seq_length=50):
        super(MLP_Classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)


        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(embedding_dim*2, embedding_dim*2)
        self.fc1_bn = nn.BatchNorm1d(embedding_dim*2)
        self.fc1_drop = nn.Dropout(p=0.5, inplace=False)

        self.fc2 = nn.Linear(embedding_dim*2, target_size)

        self.seq_length=seq_length

    def forward(self, sentence1, sentence2):
        embeds1 = self.word_embeddings(sentence1)
        embeds1 = torch.sum(embeds1, 1)
        #print("embed", embeds1.size())


        embeds2 = self.word_embeddings(sentence2)
        embeds2 = torch.sum(embeds2, 1)

        #print("embedding size:",embeds1.size(), len(sentence1))

        #embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
        #embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)

        concat = torch.cat((embeds1, embeds2), dim=1)
        #print("concat:", concat.size())

        fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(concat))))
        fc2 = self.fc2(fc1)

        return fc2

#Combine English and Chinese.

class Twolang_Classifier(nn.Module):

    def __init__(self, embedding_dim, vocab_size_en, vocab_size_zh, target_size=3, seq_length_en=50, seq_length_zh=100, kernel_num=64):
        super(Twolang_Classifier, self).__init__()

        self.embedding_dim = embedding_dim
        self.seq_length_en=seq_length_en
        self.seq_length_zh=seq_length_zh

        self.word_embeddings_en = nn.Embedding(vocab_size_en+1, embedding_dim, padding_idx=0)
        self.word_embeddings_zh = nn.Embedding(vocab_size_zh+1, embedding_dim, padding_idx=0)


        self.kernel_num=kernel_num
        self.conv2_en = nn.Conv2d(1, kernel_num, (2, embedding_dim))
        self.conv3_en = nn.Conv2d(1, kernel_num, (3, embedding_dim))
        self.conv4_en = nn.Conv2d(1, kernel_num, (4, embedding_dim))

        self.conv2 = nn.Conv2d(1, kernel_num, (2, embedding_dim))
        self.conv3 = nn.Conv2d(1, kernel_num, (3, embedding_dim))
        self.conv4 = nn.Conv2d(1, kernel_num, (4, embedding_dim))
        #self.conv5 = nn.Conv2d(1, kernel_num, (5, embedding_dim))

        self.Max2_pool_en = nn.MaxPool2d((self.seq_length_en-2+1, 1))
        self.Max3_pool_en = nn.MaxPool2d((self.seq_length_en-3+1, 1))
        self.Max4_pool_en = nn.MaxPool2d((self.seq_length_en-4+1, 1))
        #self.Max5_pool = nn.MaxPool2d((self.seq_length-5+1, 1))
        self.Max2_pool = nn.MaxPool2d((self.seq_length_zh-2+1, 1))
        self.Max3_pool = nn.MaxPool2d((self.seq_length_zh-3+1, 1))
        self.Max4_pool = nn.MaxPool2d((self.seq_length_zh-4+1, 1))


        # The linear layer that maps from hidden state space to tag space
        #self.fc1 = nn.Linear(embedding_dim*4, embedding_dim*4)
        #self.fc1_bn = nn.BatchNorm1d(embedding_dim*4)
        # self.fc1 = nn.Linear(embedding_dim+kernel_num*3, embedding_dim+kernel_num*3)
        self.fc1 = nn.Linear(kernel_num*6, kernel_num*6)

        self.fc1_bn = nn.BatchNorm1d(kernel_num*6)
        self.fc1_drop = nn.Dropout(p=0.5, inplace=False)

        self.fc2 = nn.Linear(kernel_num*6, target_size)


    def forward(self, title1_en, title2_en, title1_zh, title2_zh):
        batch = title1_en.shape[0]

        embeds1_en = self.word_embeddings_en(title1_en)
        #embeds1_en = torch.sum(embeds1_en, 1)
        embeds1_en = embeds1_en.view(batch, 1, self.seq_length_en, self.embedding_dim)

        embeds2_en = self.word_embeddings_en(title2_en)
        #embeds2_en = torch.sum(embeds2_en, 1)
        embeds2_en = embeds2_en.view(batch, 1, self.seq_length_en, self.embedding_dim)

        #Convolution
        embeds1_x2 = F.relu(self.conv2_en(embeds1_en))
        embeds1_x3 = F.relu(self.conv3_en(embeds1_en))
        embeds1_x4 = F.relu(self.conv4_en(embeds1_en))
        #embeds1_x5 = F.relu(self.conv5(embeds1_zh))

        embeds2_x2 = F.relu(self.conv2_en(embeds2_en))
        embeds2_x3 = F.relu(self.conv3_en(embeds2_en))
        embeds2_x4 = F.relu(self.conv4_en(embeds2_en))
        #embeds2_x5 = F.relu(self.conv5(embeds2_zh))

        # Pooling
        embeds1_x2 = self.Max2_pool_en(embeds1_x2).view(batch, -1)
        embeds1_x3 = self.Max3_pool_en(embeds1_x3).view(batch, -1)
        embeds1_x4 = self.Max4_pool_en(embeds1_x4).view(batch, -1)
        #embeds1_x5 = self.Max5_pool(embeds1_x5).view(batch, -1)

        embeds2_x2 = self.Max2_pool_en(embeds2_x2).view(batch, -1)
        embeds2_x3 = self.Max3_pool_en(embeds2_x3).view(batch, -1)
        embeds2_x4 = self.Max4_pool_en(embeds2_x4).view(batch, -1)
        #embeds2_x5 = self.Max5_pool(embeds2_x5).view(batch, -1)


        embeds1_en = torch.cat((embeds1_x2, embeds1_x3, embeds1_x4), dim=1)
        embeds2_en = torch.cat((embeds2_x2, embeds2_x3, embeds2_x4), dim=1)


        en_sum = embeds1_en + embeds2_en



        embeds1_zh = self.word_embeddings_zh(title1_zh)
        #embeds1_zh = torch.sum(embeds1_zh, 1)
        #For CNN.
        embeds1_zh = embeds1_zh.view(batch, 1, self.seq_length_zh, self.embedding_dim)

        embeds2_zh = self.word_embeddings_zh(title2_zh)
        #embeds2_zh = torch.sum(embeds2_zh, 1)
        #For CNN.
        embeds2_zh = embeds2_zh.view(batch, 1, self.seq_length_zh, self.embedding_dim)

        #Convolution
        embeds1_x2 = F.relu(self.conv2(embeds1_zh))
        embeds1_x3 = F.relu(self.conv3(embeds1_zh))
        embeds1_x4 = F.relu(self.conv4(embeds1_zh))
        #embeds1_x5 = F.relu(self.conv5(embeds1_zh))

        embeds2_x2 = F.relu(self.conv2(embeds2_zh))
        embeds2_x3 = F.relu(self.conv3(embeds2_zh))
        embeds2_x4 = F.relu(self.conv4(embeds2_zh))
        #embeds2_x5 = F.relu(self.conv5(embeds2_zh))

        # Pooling
        embeds1_x2 = self.Max2_pool(embeds1_x2).view(batch, -1)
        embeds1_x3 = self.Max3_pool(embeds1_x3).view(batch, -1)
        embeds1_x4 = self.Max4_pool(embeds1_x4).view(batch, -1)
        #embeds1_x5 = self.Max5_pool(embeds1_x5).view(batch, -1)

        embeds2_x2 = self.Max2_pool(embeds2_x2).view(batch, -1)
        embeds2_x3 = self.Max3_pool(embeds2_x3).view(batch, -1)
        embeds2_x4 = self.Max4_pool(embeds2_x4).view(batch, -1)
        #embeds2_x5 = self.Max5_pool(embeds2_x5).view(batch, -1)


        embeds1_zh = torch.cat((embeds1_x2, embeds1_x3, embeds1_x4), dim=1)
        embeds2_zh = torch.cat((embeds2_x2, embeds2_x3, embeds2_x4), dim=1)

        zh_sum = embeds1_zh + embeds2_zh

        #print("embedding size:",embeds1.size(), len(sentence1))

        #embeds1 = embeds1.view(self.seq_length, len(sentence1), self.embedding_dim)
        #embeds2 = embeds2.view(self.seq_length, len(sentence1), self.embedding_dim)

        #concat = torch.cat((embeds1_en, embeds2_en, embeds1_zh, embeds2_zh), dim=1)
        concat = torch.cat((en_sum, zh_sum), dim=1)

        fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(concat))))
        fc2 = self.fc2(fc1)

        return fc2


class Text_CNN_Classifier(nn.Module):

    def __init__(self, embedding_dim, vocab_size, target_size=3, seq_length=50):
        super(Text_CNN_Classifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size+1, embedding_dim, padding_idx=0)
        self.seq_length=seq_length

        self.conv3_1 = nn.Conv2d(1, 1, (3, embedding_dim))
        self.conv4_1 = nn.Conv2d(1, 1, (4, embedding_dim))
        self.conv5_1 = nn.Conv2d(1, 3, (5, embedding_dim))
        self.conv3_2 = nn.Conv2d(1, 1, (3, embedding_dim))
        self.conv4_2 = nn.Conv2d(1, 1, (4, embedding_dim))
        self.conv5_2 = nn.Conv2d(1, 1, (5, embedding_dim))

        self.Max3_pool = nn.MaxPool2d((self.seq_length-3+1, 1))
        self.Max4_pool = nn.MaxPool2d((self.seq_length-4+1, 1))
        self.Max5_pool = nn.MaxPool2d((self.seq_length-5+1, 1))

        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(6, target_size)


    def forward(self, sentence1, sentence2):
        batch = len(sentence1)
        embeds1 = self.word_embeddings(sentence1)
        embeds2 = self.word_embeddings(sentence2)

        embeds1 = embeds1.view(len(sentence1), 1, self.seq_length, self.embedding_dim)
        embeds2 = embeds2.view(len(sentence2), 1, self.seq_length, self.embedding_dim)

        # Convolution
        embeds1_x1 = F.relu(self.conv3_1(embeds1))
        embeds1_x2 = F.relu(self.conv4_1(embeds1))
        embeds1_x3 = F.relu(self.conv5_1(embeds1))
    #         embeds2_x1 = F.relu(self.conv3_2(embeds2))
    #         embeds2_x2 = F.relu(self.conv4_2(embeds2))
    #         embeds2_x3 = F.relu(self.conv5_2(embeds2))
        embeds2_x1 = F.relu(self.conv3_1(embeds2))
        embeds2_x2 = F.relu(self.conv4_1(embeds2))
        embeds2_x3 = F.relu(self.conv5_1(embeds2))

        # Pooling
        embeds1_x1 = self.Max3_pool(embeds1_x1)
        embeds1_x2 = self.Max4_pool(embeds1_x2)
        embeds1_x3 = self.Max5_pool(embeds1_x3)
        embeds2_x1 = self.Max3_pool(embeds2_x1)
        embeds2_x2 = self.Max4_pool(embeds2_x2)
        embeds2_x3 = self.Max5_pool(embeds2_x3)

        #print("max pool size:", embeds2_x3.size())

        concat = torch.cat((embeds1_x1, embeds1_x2, embeds1_x3, embeds2_x1, embeds2_x2, embeds2_x3), -1)
        x = concat.view(batch, -1)
        #print("concat:", x.size())

        fc1 = self.fc1(x)
        #print("fc1:", fc1.size())

        return fc1

DATASET


In [0]:
import pandas as pd
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


class BERTDataset(Dataset):
    def __init__(self, titles1_en, titles2_en, labels, tokenizer, seq_length=100):

        self.titles1_en = titles1_en
        self.titles2_en = titles2_en
        self.labels = labels
        self.tokenizer = tokenizer
        self.seq_length=seq_length

    def __len__(self):
        return len(self.titles1_en)


    def __getitem__(self, idx):
        seq_length = self.seq_length
        tokenizer = self.tokenizer

        title1_en = self.titles1_en[idx]
        tokens_a = tokenizer.tokenize(title1_en)
        #indexed_tokens_title1_en = tokenizer.convert_tokens_to_ids(tokenized_title1_en)


        title2_en = self.titles2_en[idx]
        tokens_b = tokenizer.tokenize(title2_en)
        #indexed_tokens_title2_en = tokenizer.convert_tokens_to_ids(tokenized_title2_en)



        def _truncate_seq_pair(tokens_a, tokens_b, max_length):
            """Truncates a sequence pair in place to the maximum length."""

            # This is a simple heuristic which will always truncate the longer sequence
            # one token at a time. This makes more sense than truncating an equal percent
            # of tokens from each, since if one sequence is very short then each token
            # that's truncated likely contains more information than a longer sequence.
            while True:
                total_length = len(tokens_a) + len(tokens_b)
                if total_length <= max_length:
                    break
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()


        _truncate_seq_pair(tokens_a, tokens_b, seq_length-3)


        tokens = []
        input_type_ids = []

        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        for token in tokens_b:
            tokens.append(token)
            input_type_ids.append(1)
        tokens.append("[SEP]")
        input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        # Zero padding.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)


        #print("input_ids:{}, input_mask:{}, input_type_ids:{}".format(len(input_ids), len(input_mask), len(input_type_ids)))
        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length

        input_ids = torch.tensor(input_ids)
        input_mask = torch.tensor(input_mask)
        input_type_ids = torch.tensor(input_type_ids)
        labels = torch.tensor(self.labels[idx], dtype=torch.long)


        #
        #
        # tokens_tensor = torch.tensor(indexed_tokens_title1_en + indexed_tokens_title2_en)
        # segments_tensor = torch.tensor(len(indexed_tokens_title1_en) * [0] + len(indexed_tokens_title2_en) * [1])
        #
        # assert len(tokens_tensor) == len(segments_ids)
        #
        # label = torch.tensor(self.labels[idx], dtype=torch.long)
        #
        sample = {'input_ids': input_ids, 'input_mask': input_mask,
                    'input_type_ids':input_type_ids, 'label': labels}

        # if self.transform:
        #     sample = self.transform(sample, self.dic_en, self.dic_zh, self.seq_length_en, self.seq_length_zh)

        return sample

# Dataset
class TitleDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, titles1_en, titles2_en,
    titles1_zh, titles2_zh, labels, dic_en=None, dic_zh=None,
    transform=None, seq_length_en=50, seq_length_zh=140,
    if_test=False):

        self.titles1_en = titles1_en
        self.titles2_en = titles2_en
        self.titles1_zh = titles1_zh
        self.titles2_zh = titles2_zh

        self.labels = labels
        self.transform = transform
        self.dic_en=dic_en
        self.dic_zh=dic_zh

        self.seq_length_en=seq_length_en
        self.seq_length_zh=seq_length_zh

        self.if_test=if_test

    def __len__(self):
        return len(self.titles1_en)

    def __getitem__(self, idx):
        title1_en = self.titles1_en[idx]
        title2_en = self.titles2_en[idx]
        title1_zh = self.titles1_zh[idx]
        title2_zh = self.titles2_zh[idx]

        if self.if_test:
            # dummy label
            label = title1_en
        else:
            label = torch.tensor(self.labels[idx], dtype=torch.long)

        sample = {'t1_en': title1_en, 't2_en': title2_en, 't1_zh': title1_zh, 't2_zh': title2_zh, 'label': label}

        if self.transform:
            sample = self.transform(sample, self.dic_en, self.dic_zh, self.seq_length_en, self.seq_length_zh)

        return sample


class Toidx(object):
    def __call__(self, sample, word_to_idx_en, word_to_idx_zh, max_seq_length_en, max_seq_length_zh):

        def prepare_sequence(seq, to_ix, max_seq_length, language="english"):
            seq = str(seq)
            #zero padding and word--->ix in seq.
            if language == "english":
                idxs = [to_ix[w] for w in seq.split()]
            elif language == "chinese":
                idxs = [to_ix[w] for w in seq]


            if len(idxs) > max_seq_length:
                idxs = idxs[:max_seq_length]
            else:
                idxs += [0] * (max_seq_length - len(idxs))
            return torch.tensor(idxs, dtype=torch.long)

        t1_en, t2_en, t1_zh, t2_zh, label = sample['t1_en'], sample['t2_en'], sample['t1_zh'], sample['t2_zh'], sample["label"]
        return {'t1_en': prepare_sequence(t1_en, word_to_idx_en, max_seq_length_en, language="english"),
                    't2_en': prepare_sequence(t2_en, word_to_idx_en, max_seq_length_en,language="english"),
                    't1_zh': prepare_sequence(t1_zh, word_to_idx_zh, max_seq_length_zh,language="chinese"),
                    't2_zh': prepare_sequence(t2_zh, word_to_idx_zh, max_seq_length_zh,language="chinese"),
                    'label': label}

TRAIN


In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler

from sklearn.model_selection import train_test_split

import re
import os
import pickle

from nltk.corpus import stopwords
import nltk

#from preprocess import preprocess_, make_new_data

from sklearn.model_selection import KFold
def train(epoch):
    model.train()

    for batch_idx, sample_batch in enumerate((train_loader)):
        #print("batch_idx:",batch_idx)
        en_title1 = sample_batch["t1_en"].to(device)
        en_title2 = sample_batch["t2_en"].to(device)
        zh_title1 = sample_batch["t1_zh"].to(device)
        zh_title2 = sample_batch["t2_zh"].to(device)
        y = sample_batch["label"].to(device)
        scheduler.step()
        optimizer.zero_grad()
        outputs = model(en_title1, en_title2, zh_title1, zh_title2)

        loss = loss_function(outputs, y)
        loss.backward()
        optimizer.step()

        #optimizer.zero_grad()
        #outputs = model(en_title2, en_title1)

        #loss = loss_function(outputs, y)
        #loss.backward()
        #optimizer.step()

    print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
    #print("train data all :", (batch_idx+1)*batch)
    return model

def test():
    with torch.no_grad():
        model.eval()
        test_loss = 0
        correct = 0

        for batch_idx, sample_batch in enumerate(val_loader):
            en_title1 = sample_batch["t1_en"].to(device)
            en_title2 = sample_batch["t2_en"].to(device)
            zh_title1 = sample_batch["t1_zh"].to(device)
            zh_title2 = sample_batch["t2_zh"].to(device)
            y = sample_batch["label"].to(device)

            output = model(en_title1, en_title2, zh_title1, zh_title2)

            # sum up batch loss
            test_loss += weighted_loss_function(output, y).item()
            # get the index of the max log-probability
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(y.view_as(pred)).sum().item()

        #test_loss /= len(val_loader.dataset)
        test_loss /= batch_idx+1
        #accuracy = 100. * correct / len(val_loader.dataset)

        accuracy = weighted_accuracy(pred, y)

        print('Validation set: Weighted loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'
              .format(test_loss, correct, len(val_loader.dataset),
                      accuracy))

        return test_loss, accuracy


def weighted_accuracy(pred, true):
    true = true.cpu().numpy()
    pred = pred.cpu().numpy()

    class_weight = [1/16, 1/15, 1/5]
    score = 0
    perfect_score = 0

    for p, t in zip(pred, true):
        if p == t:
            if t == 0:
                score += 1/16
                perfect_score += 1/16
            elif t == 1:
                score += 1/15
                perfect_score += 1/15
            elif t == 2:
                score += 1/5
                perfect_score += 1/5
        else:
            if t == 0:
                perfect_score += 1/16
            elif t == 1:
                perfect_score += 1/15
            elif t == 2:
                perfect_score += 1/5
    #print("score:{}, ideal:{}".format(score, perfect_score))
    return 100 * score/perfect_score




def save_model(model, val_accuracy, save_path=data_dir + 'model'):
   # if os.path.exists(path + "*.model"):
   #     os.remove(path + "*.model")
    name = "{}fold_mlp.model".format(fold)
    PATH = os.path.join(save_path, name)
    torch.save(model, PATH)
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [0]:
'''
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

train_df = pd.read_csv(data_dir + "train.csv")
FOLDS_PATH = None
if FOLDS_PATH is None:
    folds = KFold(n_splits=5, shuffle=False, random_state=42)
    folds_idx = [(train_idx, val_idx) 
                 for train_idx, val_idx in folds.split(train_df)]

    with open(data_dir + 'save/5Kfolds.pkl', mode='wb') as f:
        pickle.dump(folds_idx, f)
print (folds_idx)
'''


Out[0]:
'\nimport pandas as pd\nimport numpy as np\nfrom sklearn.model_selection import KFold\n\ntrain_df = pd.read_csv(data_dir + "train.csv")\nFOLDS_PATH = None\nif FOLDS_PATH is None:\n    folds = KFold(n_splits=5, shuffle=False, random_state=42)\n    folds_idx = [(train_idx, val_idx) \n                 for train_idx, val_idx in folds.split(train_df)]\n\n    with open(data_dir + \'save/5Kfolds.pkl\', mode=\'wb\') as f:\n        pickle.dump(folds_idx, f)\nprint (folds_idx)\n'

In [0]:
EMBEDDING_DIM = 512
HIDDEN_DIM = 256
max_seq_en = 50
max_seq_zh = 100
EPOCH=9

batch=1024

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)

# with open('save/word_to_ix_en.pickle', mode='rb') as f:
#      word_to_ix_en = pickle.load(f)
# with open('save/word_to_ix_zh.pickle', mode='rb') as f:
#      word_to_ix_zh = pickle.load(f)

print("@preprocessing..")
#_ = preprocess_()

# Data loading
with open(data_dir + 'save/word_to_ix_en.pickle', mode='rb') as f:
     word_to_ix_en = pickle.load(f)
with open(data_dir + 'save/word_to_ix_zh.pickle', mode='rb') as f:
     word_to_ix_zh = pickle.load(f)
with open(data_dir + 'save/train_df.pickle', mode='rb') as f:
     train_df = pickle.load(f)
with open(data_dir + 'save/test_df.pickle', mode='rb') as f:
     test_df = pickle.load(f)
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)

fold_num = 5
kf = KFold(n_splits=fold_num, random_state = 42)
kf.get_n_splits(train_df)

train_data_list = []
val_data_list = []


'''
for train_index, val_index in kf.split(train_df):
    training_df = train_df.iloc[train_index]
    val_df = train_df.iloc[val_index]

    new_data, _, _, _ = make_new_data(training_df)
    train1_en, train2_en = [],[]
    train1_zh, train2_zh = [],[]
    y_train = []
    for text1_en, text2_en, text1_zh, text2_zh,label in new_data:
            train1_en.append(text1_en)
            train2_en.append(text2_en)
            train1_zh.append(text1_zh)
            train2_zh.append(text2_zh)
            y_train.append(label)
    val1_en, val2_en = list(val_df["title1_en"]), list(val_df["title2_en"])
    val1_zh, val2_zh = list(val_df["title1_zh"]), list(val_df["title2_zh"])
    y_val = list(val_df["label"])

    train_data_list.append((train1_en,train2_en,train1_zh,train2_zh,y_train))
    val_data_list.append((val1_en, val2_en,val1_zh, val2_zh,y_val))
with open(data_dir + 'save/kfold_train_data.pickle', mode='wb') as f:
    pickle.dump(train_data_list, f)
with open(data_dir + 'save/kfold_val_data.pickle', mode='wb') as f:
    pickle.dump(val_data_list, f)
'''   


with open(data_dir + 'save/kfold_train_data.pickle', mode='rb') as f:
     train_data_list = pickle.load(f)
with open(data_dir + 'save/kfold_val_data.pickle', mode='rb') as f:
     val_data_list = pickle.load(f)

PATH = data_dir+ "model/MLP.model"
PATH_list = [data_dir + "model/{}fold_mlp.model".format(fold) for fold in range(1,6,1)]

folds_accuracies = []
Pretrained = False
fold=1
for train_fold, val_fold in zip(train_data_list,val_data_list):
    print("{}/{} fold :".format(fold, fold_num))
    print("train length:{}, val length:{}".format(len(train_fold[0]), len(val_fold[0])))

    (train1_en,train2_en,train1_zh,train2_zh,y_train) = train_fold
    (val1_en, val2_en,val1_zh, val2_zh,y_val) = val_fold
 
    # Class weight gan be got as : n_samples / (n_classes * np.bincount(y))
    c = Counter(y_train)
    class_weight = []
    for label, num in sorted(c.items()):
        print(label, num)
        class_weight.append(len(y_train)/(3*num))
    #class_weight = torch.FloatTensor(class_weight).to(device)
    #print("class weight:", class_weight)

    model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix_en), len(word_to_ix_zh), target_size=3, seq_length_en=max_seq_en,seq_length_zh=max_seq_zh)
    #model = MLP_Classifier(EMBEDDING_DIM, len(word_to_ix_en), target_size=3, seq_length=max_seq_en)
    #model = Text_CNN_Classifier(EMBEDDING_DIM, len(word_to_ix_en), target_size=3, seq_length=max_seq_length)
    
    #model = Twolang_Classifier(EMBEDDING_DIM, len(word_to_ix_en),len(word_to_ix_zh), target_size=3, kernel_num=64)

    model.to(device)
    train_dataset = TitleDataset(train1_en, train2_en, train1_zh, train2_zh, y_train,
                                 dic_en=word_to_ix_en, dic_zh=word_to_ix_zh, transform=Toidx(),
                                 seq_length_en=max_seq_en, seq_length_zh=max_seq_zh)

    val_dataset = TitleDataset(val1_en, val2_en, val1_zh, val2_zh, y_val,
                               dic_en=word_to_ix_en, dic_zh=word_to_ix_zh, transform=Toidx(),
                               seq_length_en=max_seq_en, seq_length_zh=max_seq_zh)


    class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in y_train])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))

    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=False, sampler=sampler)#, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False)
    loss_function = nn.CrossEntropyLoss()#weight=class_weight)
    weighted_loss_function = nn.CrossEntropyLoss()#weight=class_weight)

    #optimizer = optim.SGD(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = CosineAnnealingLR(optimizer, len(train_loader), eta_min = 0.001/10)

    lowest_loss = 1000000000
    highest_accuracy = 0
    for epoch in range(EPOCH):
        #print(epoch+1)
        if Pretrained == True and highest_accuracy == 0:
                name = "{}fold_mlp.model".format(fold)
                PATH = os.path.join(data_dir + 'model', name)
                model = torch.load(PATH)             
                print ('Pretrained model loaded')
        model = train(epoch)
        val_loss, accuracy = test()

    #     if val_loss < lowest_loss:
    #         lowest_loss = val_loss
    #         save_model(model)
    
        if accuracy > highest_accuracy:
            #print("saving model...")
            highest_accuracy = accuracy
            save_model(model, highest_accuracy)
        print("highest_accuracy:{:.2f}% \n".format(highest_accuracy), 'current lr: ', get_lr(optimizer))
    folds_accuracies.append(highest_accuracy)
        #break
    fold +=1
print ('Final mean accuracy: ', np.mean(folds_accuracies))


device: cuda:0
@preprocessing..
1/5 fold :
train length:263272, val length:64111
0 175184
1 74249
2 13839
/usr/local/lib/python3.6/dist-packages/torch/utils/data/sampler.py:115: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  self.weights = torch.tensor(weights, dtype=torch.double)
epoch:1,train_loss:1.0936
Validation set: Weighted loss: 1.1045, Accuracy: 12800/64111 (17.52%)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:250: UserWarning: Couldn't retrieve source code for container of type LSTM_Classifier. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "
highest_accuracy:17.52% 
 current lr:  0.00010003336087903097
epoch:2,train_loss:1.0994
Validation set: Weighted loss: 1.1034, Accuracy: 11796/64111 (25.44%)
highest_accuracy:25.44% 
 current lr:  0.000999966639120969
epoch:3,train_loss:1.1019
Validation set: Weighted loss: 1.1032, Accuracy: 8310/64111 (18.15%)
highest_accuracy:25.44% 
 current lr:  0.00010003336087903097
epoch:4,train_loss:1.0992
Validation set: Weighted loss: 1.1005, Accuracy: 18324/64111 (26.17%)
highest_accuracy:26.17% 
 current lr:  0.000999966639120969
epoch:5,train_loss:1.0994
Validation set: Weighted loss: 1.0993, Accuracy: 17886/64111 (37.04%)
highest_accuracy:37.04% 
 current lr:  0.00010003336087903097
epoch:6,train_loss:1.0982
Validation set: Weighted loss: 1.0979, Accuracy: 18858/64111 (31.19%)
highest_accuracy:37.04% 
 current lr:  0.000999966639120969
epoch:7,train_loss:1.0985
Validation set: Weighted loss: 1.1001, Accuracy: 11068/64111 (9.23%)
highest_accuracy:37.04% 
 current lr:  0.00010003336087903097
epoch:8,train_loss:1.0979
Validation set: Weighted loss: 1.0962, Accuracy: 39346/64111 (52.24%)
highest_accuracy:52.24% 
 current lr:  0.000999966639120969
epoch:9,train_loss:1.0989
Validation set: Weighted loss: 1.0976, Accuracy: 23061/64111 (39.21%)
highest_accuracy:52.24% 
 current lr:  0.00010003336087903097
2/5 fold :
train length:262944, val length:64111
0 175489
1 73948
2 13507
epoch:1,train_loss:1.0992
Validation set: Weighted loss: 1.0924, Accuracy: 38196/64111 (60.73%)
highest_accuracy:60.73% 
 current lr:  0.00010003362099862035
epoch:2,train_loss:1.0991
Validation set: Weighted loss: 1.0939, Accuracy: 29605/64111 (51.46%)
highest_accuracy:60.73% 
 current lr:  0.0009999663790013797
epoch:3,train_loss:1.0982
Validation set: Weighted loss: 1.1014, Accuracy: 15226/64111 (21.52%)
highest_accuracy:60.73% 
 current lr:  0.00010003362099862035
epoch:4,train_loss:1.1002
Validation set: Weighted loss: 1.1102, Accuracy: 16204/64111 (15.48%)
highest_accuracy:60.73% 
 current lr:  0.0009999663790013797
epoch:5,train_loss:1.0982
Validation set: Weighted loss: 1.0990, Accuracy: 18327/64111 (26.81%)
highest_accuracy:60.73% 
 current lr:  0.00010003362099862035
epoch:6,train_loss:1.0985
Validation set: Weighted loss: 1.1013, Accuracy: 13743/64111 (25.46%)
highest_accuracy:60.73% 
 current lr:  0.0009999663790013797
epoch:7,train_loss:1.0989
Validation set: Weighted loss: 1.0995, Accuracy: 7523/64111 (21.90%)
highest_accuracy:60.73% 
 current lr:  0.00010003362099862035
epoch:8,train_loss:1.0992
Validation set: Weighted loss: 1.0978, Accuracy: 11646/64111 (20.55%)
highest_accuracy:60.73% 
 current lr:  0.0009999663790013797
epoch:9,train_loss:1.0986
Validation set: Weighted loss: 1.0984, Accuracy: 30293/64111 (44.23%)
highest_accuracy:60.73% 
 current lr:  0.00010003362099862035
3/5 fold :
train length:262800, val length:64110
0 175170
1 74245
2 13385
epoch:1,train_loss:1.0988
Validation set: Weighted loss: 1.0969, Accuracy: 24538/64110 (36.31%)
highest_accuracy:36.31% 
 current lr:  0.00010003362099862035
epoch:2,train_loss:1.1000
Validation set: Weighted loss: 1.0965, Accuracy: 22439/64110 (36.71%)
highest_accuracy:36.71% 
 current lr:  0.0009999663790013797
epoch:3,train_loss:1.0982
Validation set: Weighted loss: 1.0977, Accuracy: 25035/64110 (34.46%)
highest_accuracy:36.71% 
 current lr:  0.00010003362099862035
epoch:4,train_loss:1.0986
Validation set: Weighted loss: 1.1037, Accuracy: 4073/64110 (10.68%)
highest_accuracy:36.71% 
 current lr:  0.0009999663790013797
epoch:5,train_loss:1.0988
Validation set: Weighted loss: 1.0994, Accuracy: 14175/64110 (30.30%)
highest_accuracy:36.71% 
 current lr:  0.00010003362099862035
epoch:6,train_loss:1.0982
Validation set: Weighted loss: 1.0907, Accuracy: 27045/64110 (45.14%)
highest_accuracy:45.14% 
 current lr:  0.0009999663790013797
epoch:7,train_loss:1.0980
Validation set: Weighted loss: 1.0989, Accuracy: 20220/64110 (35.58%)
highest_accuracy:45.14% 
 current lr:  0.00010003362099862035
epoch:8,train_loss:1.0978
Validation set: Weighted loss: 1.0979, Accuracy: 26524/64110 (35.30%)
highest_accuracy:45.14% 
 current lr:  0.0009999663790013797
epoch:9,train_loss:1.0981
Validation set: Weighted loss: 1.1019, Accuracy: 17408/64110 (26.87%)
highest_accuracy:45.14% 
 current lr:  0.00010003362099862035
4/5 fold :
train length:263110, val length:64110
0 175217
1 74174
2 13719
epoch:1,train_loss:1.0988
Validation set: Weighted loss: 1.0942, Accuracy: 28322/64110 (37.67%)
highest_accuracy:37.67% 
 current lr:  0.00010003362099862035
epoch:2,train_loss:1.0998
Validation set: Weighted loss: 1.1062, Accuracy: 6875/64110 (19.42%)
highest_accuracy:37.67% 
 current lr:  0.0009999663790013797
epoch:3,train_loss:1.0989
Validation set: Weighted loss: 1.0969, Accuracy: 29312/64110 (41.20%)
highest_accuracy:41.20% 
 current lr:  0.00010003362099862035
epoch:4,train_loss:1.0985
Validation set: Weighted loss: 1.0929, Accuracy: 39853/64110 (53.63%)
highest_accuracy:53.63% 
 current lr:  0.0009999663790013797
epoch:5,train_loss:1.0986
Validation set: Weighted loss: 1.0967, Accuracy: 36688/64110 (53.20%)
highest_accuracy:53.63% 
 current lr:  0.00010003362099862035
epoch:6,train_loss:1.0988
Validation set: Weighted loss: 1.0988, Accuracy: 18594/64110 (22.20%)
highest_accuracy:53.63% 
 current lr:  0.0009999663790013797
epoch:7,train_loss:1.0983
Validation set: Weighted loss: 1.0975, Accuracy: 34044/64110 (53.20%)
highest_accuracy:53.63% 
 current lr:  0.00010003362099862035
epoch:8,train_loss:1.0979
Validation set: Weighted loss: 1.1006, Accuracy: 18585/64110 (22.20%)
highest_accuracy:53.63% 
 current lr:  0.0009999663790013797
epoch:9,train_loss:1.0985
Validation set: Weighted loss: 1.0994, Accuracy: 15315/64110 (17.08%)
highest_accuracy:53.63% 
 current lr:  0.00010003362099862035
5/5 fold :
train length:263017, val length:64110
0 175190
1 74203
2 13624
epoch:1,train_loss:1.0988
Validation set: Weighted loss: 1.0977, Accuracy: 21884/64110 (37.37%)
highest_accuracy:37.37% 
 current lr:  0.00010003362099862035
epoch:2,train_loss:1.0989
Validation set: Weighted loss: 1.1037, Accuracy: 9774/64110 (21.78%)
highest_accuracy:37.37% 
 current lr:  0.0009999663790013797
epoch:3,train_loss:1.0988
Validation set: Weighted loss: 1.0963, Accuracy: 30519/64110 (44.63%)
highest_accuracy:44.63% 
 current lr:  0.00010003362099862035
epoch:4,train_loss:1.0983
Validation set: Weighted loss: 1.1079, Accuracy: 3048/64110 (9.00%)
highest_accuracy:44.63% 
 current lr:  0.0009999663790013797
epoch:5,train_loss:1.0988
Validation set: Weighted loss: 1.0966, Accuracy: 39168/64110 (61.85%)
highest_accuracy:61.85% 
 current lr:  0.00010003362099862035
epoch:6,train_loss:1.0990
Validation set: Weighted loss: 1.1008, Accuracy: 2853/64110 (8.10%)
highest_accuracy:61.85% 
 current lr:  0.0009999663790013797
epoch:7,train_loss:1.0987
Validation set: Weighted loss: 1.0998, Accuracy: 5259/64110 (14.91%)
highest_accuracy:61.85% 
 current lr:  0.00010003362099862035

In [0]:
torch.cuda.is_available()

TEST


In [0]:
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import Sampler

import re
import os

#from model import *
#from dataset import TitleDataset, Toidx
#from preprocess import preprocess_, make_new_data

import pickle
from collections import defaultdict



# _ = preprocess_()


with open(data_dir + 'save/word_to_ix_en.pickle', mode='rb') as f:
     word_to_ix_en = pickle.load(f)
with open(data_dir + 'save/word_to_ix_zh.pickle', mode='rb') as f:
     word_to_ix_zh = pickle.load(f)
with open(data_dir + 'save/train_df.pickle', mode='rb') as f:
     train_df = pickle.load(f)
with open(data_dir + 'save/test_df.pickle', mode='rb') as f:
     test_df = pickle.load(f)

#_,given_dic,fixed_dic,forecast_dic = make_new_data(train_df)

with open(data_dir + 'save/fixed_dic.pickle', mode='rb') as f:
    fixed_dic = pickle.load(f)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


EMBEDDING_DIM = 512
HIDDEN_DIM = 128
max_seq_en = 50
max_seq_zh = 100

model = LSTM_Classifier(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)
#model = MLP_Classifier(EMBEDDING_DIM, len(word_to_ix), target_size=3, seq_length=max_seq_length)
#model = Twolang_Classifier(EMBEDDING_DIM, len(word_to_ix_en),len(word_to_ix_zh), target_size=3)

title1_en_test = list(test_df["title1_en"])
title2_en_test = list(test_df["title2_en"])
title1_zh_test = list(test_df["title1_zh"])
title2_zh_test = list(test_df["title2_zh"])
test_tid1 = list(test_df["tid1"])
test_tid2 = list(test_df["tid2"])

id_ = test_df["id"]


preded_id_label = []

given, not_given = 0, 0

agree_dic = defaultdict(list)
disagree_dic = defaultdict(list)

for id1, id_label_list in fixed_dic.items():
    if len(id_label_list) == 0:
        continue
    id_list = np.array(id_label_list)[:,0]
    label_list = np.array(id_label_list)[:,1]
    for id2, label in zip(id_list, label_list):

        if label == 1:
            agree_dic[id1].append(id2)
        elif label == 2:
            disagree_dic[id1].append(id2)


change=0
while True:
    for tid1, agree_id_list in agree_dic.items():
        for tid2 in agree_id_list:
            disagree_to_tid2 = disagree_dic[tid2]
            for dis in disagree_to_tid2:
                if not dis in disagree_dic[tid1]:
                    disagree_dic[tid1].append(dis)
                    change+=1

                if not tid1 in disagree_dic[dis]:
                    disagree_dic[dis].append(tid1)
                    change+=1

            agree_to_tid2 = agree_dic[tid2]
            for dis in agree_to_tid2:
                if not dis in agree_dic[tid1]:
                    agree_dic[tid1].append(dis)
                    change+=1

                if not tid1 in agree_dic[dis]:
                    agree_dic[dis].append(tid1)
                    change+=1
    for tid1, disagree_id_list in disagree_dic.items():
        for tid2 in disagree_id_list:

            agree_to_tid2 = agree_dic[tid2]
            for dis in agree_to_tid2:
                if not dis in disagree_dic[tid1]:
                    disagree_dic[tid1].append(dis)
                    change+=1

                if not tid1 in disagree_dic[dis]:
                    disagree_dic[dis].append(tid1)
                    change+=1

    print("change number: ", change)
    if change == 0:
        break
    else:
        change = 0

mujun = 0

for id1, id2, each_id in zip(test_tid1, test_tid2, id_):
    if id2 in disagree_dic[id1]:
        #check
        if id1 in disagree_dic[id2]:
            preded_id_label.append((each_id, 2))
        else:
            mujun+=1

    elif id2 in agree_dic[id1]:
        #check
        if id1 in agree_dic[id2]:
            preded_id_label.append((each_id, 1))
        else:
            mujun+=1


preded_id_label = []
print("What could be predicted:{}, Contradiction:{}, total:{}".format(len(preded_id_label), mujun, len(test_df)))




#
# for id1, id2, each_id in zip(test_tid1, test_tid2, id_):
#     if not id1 in forecast_dic.keys():
#         #print("label cannot be predicted")
#         not_given+=1
#         pass
#     else:
#         forecast_data_label = np.array(forecast_dic[id1])
#         if len(forecast_data_label) == 0:
#             continue
#
#         forecast_id = forecast_data_label[:,0]
#         forecast_label = forecast_data_label[:,1]
#
#         if id2 in forecast_id:
#             idx = list(forecast_id).index(id2)
#             label = forecast_label[idx]
#              given+=1
#             # preded_id_label.append((each_id, label))
#         else:
#             #print("label not given")
#             not_given+=1
#             pass
# print("予測可能セット:{}, わからないセット:{}".format(given, not_given))


PATH = data_dir+ "model/MLP.model"
PATH_list = [data_dir + "model/{}fold_mlp.model".format(fold) for fold in range(1,6,1)]


average_prediction = []
for PATH in PATH_list:

    model = torch.load(PATH)
    print("model loaded:{}".format(PATH))


    # test dataset. label is None.
    test_dataset = TitleDataset(title1_en_test, title2_en_test, title1_zh_test, title2_zh_test, None,
                                dic_en=word_to_ix_en, dic_zh=word_to_ix_zh, transform=Toidx(),
                                seq_length_en=max_seq_en, seq_length_zh=max_seq_zh, if_test=True)


    test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

    with torch.no_grad():
        model.eval()
        predictions = []
        for batch_idx, sample_batch in enumerate(tqdm(test_loader)):
            en_title1 = sample_batch["t1_en"].to(device)
            en_title2 = sample_batch["t2_en"].to(device)
            zh_title1 = sample_batch["t1_zh"].to(device)
            zh_title2 = sample_batch["t2_zh"].to(device)
            output = model(en_title1, en_title2, zh_title1, zh_title2)

            # pred = output.max(1, keepdim=True)[1].cpu()
            #print("model out :",output.size())
            #predictions.extend(list(pred.numpy()))
            output = output.cpu().numpy()
            #print("model out:",output.shape)

            if batch_idx == 0:
                predictions = output
            else:
                predictions = np.vstack((predictions, output))

    average_prediction.append(predictions)

average_prediction = np.array(average_prediction)
# print("total pred:", average_prediction.shape)
average_prediction = np.mean(average_prediction, axis=0)
# print("total pred:", average_prediction.shape)

predictions = np.argmax(average_prediction, axis=1)
print("predictions:", predictions.shape)

#'unrelated', 0
#'agreed', 1
#'disagreed', 2


if len(preded_id_label) == 0:
    preded_labels = []
    preded_id = []
else:
    preded_id = np.array(preded_id_label)[:, 0]
    preded_labels = np.array(preded_id_label)[:, 1]
print("directly preded label:", len(preded_id))


fixed_predictions = []
for each_id, p in zip(id_, predictions):
    if each_id in preded_id:
        idx = list(preded_id).index(each_id)
        fixed_predictions.append(preded_labels[idx])
    else:
        fixed_predictions.append(p)


new_predictions = []
for p in fixed_predictions:
    if p == 0:
        new_predictions.append("unrelated")
    elif p==1:
        new_predictions.append("agreed")
    elif p==2:
        new_predictions.append("disagreed")


#
# c = Counter(list(predictions))
# print("original",c)
#
# c = Counter(fixed_predictions)
# print("fixed", c)


submit_csv = pd.concat([id_, pd.Series(new_predictions)], axis=1)
#display(submit_csv)

submit_csv.columns = ["Id", "Category"]
submit_csv.to_csv(data_dir + "submit.csv", header=True, index=False)

In [0]:
submit_csv.to_csv(data_dir + "submit.csv", header=True, index=False)
submit = pd.read_csv("submit.csv")

In [0]:
import requests
from bs4 import BeautifulSoup

search = "1000人犯罪团伙来德州偷孩子取器官,男子散播“1000人来德州偷孩子挖器官”谣言"

r = requests.get("https://www.google.com/search", params={'q':search})

soup = BeautifulSoup(r.text, "html.parser")
res = soup.find("div", {"id": "resultStats"})
print (res.text)


About 8 results

TRAIN CHINESE BERT


In [0]:
!pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

from sklearn.model_selection import train_test_split

import re
import os

from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')
import copy
# from model import BERT_Classifier
#from dataset import *
from collections import defaultdict
from sklearn.model_selection import KFold

import requests

class BERT_Classifier(nn.Module):
    def __init__(self, bert_model, target_size=3):
        super(BERT_Classifier, self).__init__()

        self.embedding_dim=768
        kernel_num=256
        self.seq_length_en=100

        self.bert_model = bert_model
        # self.conv2_en = nn.Conv2d(1, kernel_num, (2, self.embedding_dim))
        # self.conv3_en = nn.Conv2d(1, kernel_num, (3, self.embedding_dim))
        # self.conv4_en = nn.Conv2d(1, kernel_num, (4, self.embedding_dim))
        # self.Max2_pool_en = nn.MaxPool2d((self.seq_length_en-2+1, 1))
        # self.Max3_pool_en = nn.MaxPool2d((self.seq_length_en-3+1, 1))
        # self.Max4_pool_en = nn.MaxPool2d((self.seq_length_en-4+1, 1))


        # self.fc1 = nn.Linear(kernel_num*3, 300)
        # self.fc1_bn = nn.BatchNorm1d(300)
        # self.fc1_drop = nn.Dropout(p=0.3, inplace=False)
        # self.fc2 = nn.Linear(300, target_size)

        self.fc1 = nn.Linear(768, 768)
        #self.fc1_bn = nn.BatchNorm1d(300)
        self.fc1_drop = nn.Dropout(p=0.3, inplace=False)
        self.activation = nn.Tanh()
        self.fc2 = nn.Linear(768, target_size)

    def forward(self, input_ids, input_mask):
        batch = len(input_ids)

        last_encoder_layer, _ = self.bert_model(input_ids, token_type_ids=None, attention_mask=input_mask, output_all_encoded_layers=False)

        # last_encoder_layer = last_encoder_layer.view(batch, 1, self.seq_length_en, self.embedding_dim)
        #
        #
        # conv2 = F.relu(self.conv2_en(last_encoder_layer))
        # conv3 = F.relu(self.conv3_en(last_encoder_layer))
        # conv4 = F.relu(self.conv4_en(last_encoder_layer))
        #
        # pool2 = self.Max2_pool_en(conv2).view(batch, -1)
        # pool3 = self.Max3_pool_en(conv3).view(batch, -1)
        # pool4 = self.Max4_pool_en(conv4).view(batch, -1)

        #print(last_encoder_layer.size())
        # embedding = torch.sum(last_encoder_layer, 1)

        #cat = torch.cat((pool2, pool3, pool4), dim=1)

        #print("fc1", cat.size())

        first_token_tensor = last_encoder_layer[:, 0]

        # fc1 = self.fc1_drop(F.relu(self.fc1(first_token_tensor)))
        fc1 = self.fc1_drop(self.activation(self.fc1(first_token_tensor)))
        fc2 = self.fc2(fc1)

        return fc2
 

EMBEDDING_DIM = 512
HIDDEN_DIM = 256
max_seq_en = 50
max_seq_zh = 60
EPOCH= 5

batch=64

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)


train_df = pd.read_csv(data_dir + "train.csv")
# test_df = pd.read_csv("data/test.csv")

train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)


def chinese_clean_series(series):
    def clean_seq(seq):
        seq = str(seq)
        ori = copy.copy(seq)

        seq = seq.replace("< i >", "")
        seq = seq.replace("< / i >", "")
        seq = seq.replace("\n", "")
        seq = re.sub(r'[,."''“”。、#()→⇒←↓↑:;_㊙️【《》=|/<>]+', '', seq)
        seq = re.sub(r'[!!??-]+', ' ', seq)
        seq = re.sub(r'[$]+', '$ ', seq)
        seq = re.sub(r'[0-9]+', '<NUM>', seq)

        if len(seq)==0:
            print("0 lengrh assert!!,",ori, seq)

        return seq

    series = series.apply(clean_seq)
    return series



train_df["title1_zh"] =  chinese_clean_series(train_df["title1_zh"])
train_df["title2_zh"] =  chinese_clean_series(train_df["title2_zh"])


train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)#.iloc[:300, :]


# K-Fold Cross validation
fold_num = 5
kf = KFold(n_splits=fold_num , random_state = 42)
kf.get_n_splits(train_df)

# kf.get_n_splits(X, y)

train_data_list = []
val_data_list = []
for train_index, val_index in kf.split(train_df):
#for train_index, val_index in kf.split(X):
    training_df = train_df.iloc[train_index]
    val_df = train_df.iloc[val_index]



    train1_en, train2_en = list(training_df["title1_en"]), list(training_df["title2_en"])
    train1_zh, train2_zh = list(training_df["title1_zh"]), list(training_df["title2_zh"])

    y_train = list(training_df["label"])

    val1_en, val2_en = list(val_df["title1_en"]), list(val_df["title2_en"])
    val1_zh, val2_zh = list(val_df["title1_zh"]), list(val_df["title2_zh"])
    y_val = list(val_df["label"])


    train_data_list.append((train1_zh,train2_zh, y_train))#train1_zh,train2_zh,y_train))
    val_data_list.append((val1_zh, val2_zh, y_val))# val1_zh, val2_zh,y_val))
#
# with open('save/kfold_train_data.pickle', mode='wb') as f:
#     pickle.dump(train_data_list, f)
# with open('save/kfold_val_data.pickle', mode='wb') as f:
#     pickle.dump(val_data_list, f)


tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert_model = BertModel.from_pretrained('bert-base-chinese').to(device)
bert_model.eval()

fold=1
for train_fold, val_fold in zip(train_data_list,val_data_list):
    print("{}/{} fold :".format(fold, fold_num))
    print("train length:{}, val length:{}".format(len(train_fold[0]), len(val_fold[0])))

    (train1_en,train2_en,y_train) = train_fold
    (val1_en, val2_en,y_val) = val_fold

    c = Counter(y_train)
    class_weight = []
    for label, num in sorted(c.items()):
        print(label, num)
        class_weight.append(len(y_train)/(3*num))
    class_weight = torch.FloatTensor(class_weight).to(device)




    model = BERT_Classifier(bert_model)
    model.to(device)
    loss_function = nn.CrossEntropyLoss()#weight=class_weight)
    weighted_loss_function = nn.CrossEntropyLoss(weight=class_weight)#weight=class_weight)




    train_dataset = BERTDataset(train1_zh, train2_zh, y_train, tokenizer, seq_length=max_seq_zh)
    val_dataset = BERTDataset(val1_zh, val2_zh, y_val, tokenizer, seq_length=max_seq_zh)

    class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in y_train])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))

    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=False, sampler=sampler)#, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch, shuffle=False)
    
    #optimizer = optim.SGD(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = CosineAnnealingLR(optimizer, len(train_loader), eta_min = 0.001/10)

    def train(epoch):
        model.train()

        for batch_idx, sample_batch in enumerate(tqdm(train_loader)):
            input_ids = sample_batch["input_ids"].to(device)
            input_mask = sample_batch["input_mask"].to(device)
            input_type_ids = sample_batch["input_type_ids"].to(device)
            y = sample_batch["label"].to(device)
            scheduler.step()
            optimizer.zero_grad()
            outputs = model(input_ids, input_mask)

            loss = loss_function(outputs, y)
            loss.backward()
            optimizer.step()

            if batch_idx%100==0:
                print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))


        print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss)) 

        #print("train data all :", (batch_idx+1)*batch)

        return model



    def test():
        with torch.no_grad():
            model.eval()
            test_loss = 0
            correct = 0

            for batch_idx, sample_batch in enumerate(val_loader):
                input_ids = sample_batch["input_ids"].to(device)
                input_mask = sample_batch["input_mask"].to(device)
                input_type_ids = sample_batch["input_type_ids"].to(device)
                y = sample_batch["label"].to(device)

                output = model(input_ids, input_mask)
                # sum up batch loss
                #test_loss += weighted_loss_function(output, y).item()
                test_loss += loss_function(output, y).item()
                # get the index of the max log-probability
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(y.view_as(pred)).sum().item()

            #test_loss /= len(val_loader.dataset)
            test_loss /= batch_idx+1
            #accuracy = 100. * correct / len(val_loader.dataset)

            accuracy = weighted_accuracy(pred, y)

            print('Validation set: Weighted loss: {:.4f}, Weighted Accuracy: {}/{} ({:.2f}%)'
                  .format(test_loss, correct, len(val_loader.dataset),
                          accuracy))

            return test_loss, accuracy


    def weighted_accuracy(pred, true):
        true = true.cpu().numpy()
        pred = pred.cpu().numpy()

        class_weight = [1/16, 1/15, 1/5]
        score = 0
        perfect_score = 0

        for p, t in zip(pred, true):
            if p == t:
                if t == 0:
                    score += 1/16
                    perfect_score += 1/16
                elif t == 1:
                    score += 1/15
                    perfect_score += 1/15
                elif t == 2:
                    score += 1/5
                    perfect_score += 1/5
            else:
                if t == 0:
                    perfect_score += 1/16
                elif t == 1:
                    perfect_score += 1/15
                elif t == 2:
                    perfect_score += 1/5
        #print("score:{}, ideal:{}".format(score, perfect_score))
        return 100 * score/perfect_score

    def save_model(model, val_accuracy, save_path=data_dir + 'model'):
    # if os.path.exists(path + "*.model"):
    #     os.remove(path + "*.model")
        name = "{}fold_mlp.model".format(fold)
        PATH = os.path.join(save_path, name)
        torch.save(model, PATH)

    lowest_loss = 1000000000
    highest_accuracy = 0
    for epoch in range(EPOCH):
        #print(epoch+1)
        model = train(epoch)
        val_loss, accuracy = test()

    #     if val_loss < lowest_loss:
    #         lowest_loss = val_loss
    #         save_model(model)

        if accuracy > highest_accuracy:
            #print("saving model...")
            highest_accuracy = accuracy
            save_model(model, highest_accuracy)
        print("highest_accuracy:{:.2f}% \n".format(highest_accuracy))

    fold+=1


Requirement already satisfied: pytorch_pretrained_bert in /usr/local/lib/python3.6/dist-packages (0.3.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.14.6)
Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.9.62)
Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (4.28.1)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (2.18.4)
Requirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.0.0)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (0.9.3)
Requirement already satisfied: s3transfer<0.2.0,>=0.1.10 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (0.1.13)
Requirement already satisfied: botocore<1.13.0,>=1.12.62 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (1.12.62)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (2018.11.29)
Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (2.6)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (3.0.4)
Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (1.22)
Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= "2.7" in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.62->boto3->pytorch_pretrained_bert) (2.5.3)
Requirement already satisfied: docutils>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.62->boto3->pytorch_pretrained_bert) (0.14)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<3.0.0,>=2.1; python_version >= "2.7"->botocore<1.13.0,>=1.12.62->boto3->pytorch_pretrained_bert) (1.11.0)
device: cuda:0
12/10/2018 09:24:30 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.pytorch_pretrained_bert/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00
12/10/2018 09:24:31 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz from cache at /root/.pytorch_pretrained_bert/42d4a64dda3243ffeca7ec268d5544122e67d9d06b971608796b483925716512.02ac7d664cff08d793eb00d6aac1d04368a1322435e5fe0a27c70b0b3a85327f
12/10/2018 09:24:31 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/42d4a64dda3243ffeca7ec268d5544122e67d9d06b971608796b483925716512.02ac7d664cff08d793eb00d6aac1d04368a1322435e5fe0a27c70b0b3a85327f to temp dir /tmp/tmp9xresmxm
12/10/2018 09:24:36 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}

/usr/local/lib/python3.6/dist-packages/torch/utils/data/sampler.py:115: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  self.weights = torch.tensor(weights, dtype=torch.double)
  0%|          | 0/4007 [00:00<?, ?it/s]
1/5 fold :
train length:256441, val length:64111
0 175383
1 74459
2 6599
  0%|          | 1/4007 [00:01<1:50:37,  1.66s/it]
epoch:1,train_loss:1.1423
  3%|▎         | 101/4007 [02:18<1:29:33,  1.38s/it]
epoch:1,train_loss:1.1396
  5%|▌         | 201/4007 [04:36<1:27:27,  1.38s/it]
epoch:1,train_loss:1.1218
  8%|▊         | 301/4007 [06:53<1:24:58,  1.38s/it]
epoch:1,train_loss:1.1077
 10%|█         | 401/4007 [09:10<1:22:54,  1.38s/it]
epoch:1,train_loss:1.0822
 13%|█▎        | 501/4007 [11:27<1:20:23,  1.38s/it]
epoch:1,train_loss:1.1135
 15%|█▍        | 601/4007 [13:45<1:18:22,  1.38s/it]
epoch:1,train_loss:1.1127
 17%|█▋        | 701/4007 [16:02<1:15:59,  1.38s/it]
epoch:1,train_loss:1.1159
 20%|█▉        | 801/4007 [18:20<1:13:23,  1.37s/it]
epoch:1,train_loss:1.0836
 22%|██▏       | 901/4007 [20:37<1:11:10,  1.37s/it]
epoch:1,train_loss:1.0883
 25%|██▍       | 1001/4007 [22:54<1:08:56,  1.38s/it]
epoch:1,train_loss:1.1167
 27%|██▋       | 1101/4007 [25:12<1:06:37,  1.38s/it]
epoch:1,train_loss:1.0970
 30%|██▉       | 1201/4007 [27:29<1:04:35,  1.38s/it]
epoch:1,train_loss:1.0944
 32%|███▏      | 1301/4007 [29:46<1:01:59,  1.37s/it]
epoch:1,train_loss:1.1065
 35%|███▍      | 1401/4007 [32:04<59:44,  1.38s/it]
epoch:1,train_loss:1.1024
 37%|███▋      | 1501/4007 [34:21<57:27,  1.38s/it]
epoch:1,train_loss:1.1036
 40%|███▉      | 1601/4007 [36:38<55:12,  1.38s/it]
epoch:1,train_loss:1.0996
 42%|████▏     | 1701/4007 [38:55<52:53,  1.38s/it]
epoch:1,train_loss:1.1051
 45%|████▍     | 1801/4007 [41:13<50:42,  1.38s/it]
epoch:1,train_loss:1.0999
 47%|████▋     | 1901/4007 [43:30<48:17,  1.38s/it]
epoch:1,train_loss:1.0990
 50%|████▉     | 2001/4007 [45:47<45:59,  1.38s/it]
epoch:1,train_loss:1.0998
 52%|█████▏    | 2101/4007 [48:05<43:44,  1.38s/it]
epoch:1,train_loss:1.1101
 55%|█████▍    | 2201/4007 [50:22<41:32,  1.38s/it]
epoch:1,train_loss:1.0930
 57%|█████▋    | 2301/4007 [52:39<39:08,  1.38s/it]
epoch:1,train_loss:1.0998
 60%|█████▉    | 2401/4007 [54:57<36:51,  1.38s/it]
epoch:1,train_loss:1.0985
 62%|██████▏   | 2501/4007 [57:14<34:28,  1.37s/it]
epoch:1,train_loss:1.1082
 65%|██████▍   | 2601/4007 [59:31<32:20,  1.38s/it]
epoch:1,train_loss:1.0993
 67%|██████▋   | 2701/4007 [1:01:49<29:56,  1.38s/it]
epoch:1,train_loss:1.0968
 70%|██████▉   | 2801/4007 [1:04:06<27:39,  1.38s/it]
epoch:1,train_loss:1.0977
 72%|███████▏  | 2901/4007 [1:06:23<25:20,  1.38s/it]
epoch:1,train_loss:1.1004
 75%|███████▍  | 3001/4007 [1:08:41<23:06,  1.38s/it]
epoch:1,train_loss:1.0996
 77%|███████▋  | 3101/4007 [1:10:58<20:48,  1.38s/it]
epoch:1,train_loss:1.0893
 80%|███████▉  | 3201/4007 [1:13:15<18:30,  1.38s/it]
epoch:1,train_loss:1.0936
 82%|████████▏ | 3301/4007 [1:15:32<16:11,  1.38s/it]
epoch:1,train_loss:1.1012
 85%|████████▍ | 3401/4007 [1:17:50<13:53,  1.37s/it]
epoch:1,train_loss:1.0945
 87%|████████▋ | 3501/4007 [1:20:07<11:34,  1.37s/it]
epoch:1,train_loss:1.0987
 90%|████████▉ | 3601/4007 [1:22:25<09:18,  1.38s/it]
epoch:1,train_loss:1.1012
 92%|█████████▏| 3701/4007 [1:24:42<07:01,  1.38s/it]
epoch:1,train_loss:1.0978
 95%|█████████▍| 3801/4007 [1:26:59<04:43,  1.38s/it]
epoch:1,train_loss:1.0995
 97%|█████████▋| 3901/4007 [1:29:17<02:26,  1.38s/it]
epoch:1,train_loss:1.1007
100%|█████████▉| 4001/4007 [1:31:34<00:08,  1.38s/it]
epoch:1,train_loss:1.0963
100%|██████████| 4007/4007 [1:31:42<00:00,  1.33s/it]
epoch:1,train_loss:1.0978
Validation set: Weighted loss: 1.1028, Weighted Accuracy: 1667/64110 (12.44%)
/usr/local/lib/python3.6/dist-packages/torch/serialization.py:250: UserWarning: Couldn't retrieve source code for container of type BERT_Classifier. It won't be checked for correctness upon loading.
  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 0/4007 [00:00<?, ?it/s]
highest_accuracy:12.44% 

  0%|          | 1/4007 [00:01<1:47:03,  1.60s/it]
epoch:2,train_loss:1.1006
  3%|▎         | 101/4007 [02:19<1:29:46,  1.38s/it]
epoch:2,train_loss:1.0984
  5%|▌         | 201/4007 [04:36<1:27:29,  1.38s/it]
epoch:2,train_loss:1.0987
  8%|▊         | 301/4007 [06:53<1:25:01,  1.38s/it]
epoch:2,train_loss:1.0995
 10%|█         | 401/4007 [09:11<1:22:31,  1.37s/it]
epoch:2,train_loss:1.0992
 13%|█▎        | 501/4007 [11:28<1:20:24,  1.38s/it]
epoch:2,train_loss:1.0936
 15%|█▍        | 601/4007 [13:45<1:18:04,  1.38s/it]
epoch:2,train_loss:1.0969
 17%|█▋        | 701/4007 [16:03<1:16:02,  1.38s/it]
epoch:2,train_loss:1.0975
 20%|█▉        | 801/4007 [18:20<1:13:44,  1.38s/it]
epoch:2,train_loss:1.1008
 22%|██▏       | 901/4007 [20:37<1:11:12,  1.38s/it]
epoch:2,train_loss:1.1003
 25%|██▍       | 1001/4007 [22:55<1:08:58,  1.38s/it]
epoch:2,train_loss:1.0974
 27%|██▋       | 1101/4007 [25:12<1:06:45,  1.38s/it]
epoch:2,train_loss:1.1006
 30%|██▉       | 1201/4007 [27:29<1:04:16,  1.37s/it]
epoch:2,train_loss:1.1138
 32%|███▏      | 1301/4007 [29:47<1:02:06,  1.38s/it]
epoch:2,train_loss:1.1041
 35%|███▍      | 1401/4007 [32:04<59:46,  1.38s/it]
epoch:2,train_loss:1.1001
 37%|███▋      | 1501/4007 [34:22<57:43,  1.38s/it]
epoch:2,train_loss:1.1017
 40%|███▉      | 1601/4007 [36:39<55:21,  1.38s/it]
epoch:2,train_loss:1.0948
 42%|████▏     | 1701/4007 [38:56<52:51,  1.38s/it]
epoch:2,train_loss:1.1157
 45%|████▍     | 1801/4007 [41:14<50:38,  1.38s/it]
epoch:2,train_loss:1.1056
 47%|████▋     | 1901/4007 [43:31<48:19,  1.38s/it]
epoch:2,train_loss:1.0958
 50%|████▉     | 2001/4007 [45:48<46:03,  1.38s/it]
epoch:2,train_loss:1.1066
 52%|█████▏    | 2101/4007 [48:06<43:42,  1.38s/it]
epoch:2,train_loss:1.0908
 55%|█████▍    | 2201/4007 [50:23<41:28,  1.38s/it]
epoch:2,train_loss:1.0948
 57%|█████▋    | 2301/4007 [52:41<39:09,  1.38s/it]
epoch:2,train_loss:1.0986
 60%|█████▉    | 2401/4007 [54:58<36:50,  1.38s/it]
epoch:2,train_loss:1.1167
 62%|██████▏   | 2501/4007 [57:15<34:31,  1.38s/it]
epoch:2,train_loss:1.0996
 65%|██████▍   | 2601/4007 [59:33<32:15,  1.38s/it]
epoch:2,train_loss:1.1161
 67%|██████▋   | 2701/4007 [1:01:50<29:56,  1.38s/it]
epoch:2,train_loss:1.0961
 70%|██████▉   | 2801/4007 [1:04:08<27:40,  1.38s/it]
epoch:2,train_loss:1.0951
 72%|███████▏  | 2901/4007 [1:06:26<25:26,  1.38s/it]
epoch:2,train_loss:1.0991
 75%|███████▍  | 3001/4007 [1:08:43<23:08,  1.38s/it]
epoch:2,train_loss:1.0928
 77%|███████▋  | 3101/4007 [1:11:01<20:44,  1.37s/it]
epoch:2,train_loss:1.1114
 80%|███████▉  | 3201/4007 [1:13:18<18:33,  1.38s/it]
epoch:2,train_loss:1.0813
 82%|████████▏ | 3301/4007 [1:15:36<16:14,  1.38s/it]
epoch:2,train_loss:1.0885
 85%|████████▍ | 3401/4007 [1:17:53<13:54,  1.38s/it]
epoch:2,train_loss:1.1094
 87%|████████▋ | 3501/4007 [1:20:11<11:36,  1.38s/it]
epoch:2,train_loss:1.1211
 90%|████████▉ | 3601/4007 [1:22:28<09:19,  1.38s/it]
epoch:2,train_loss:1.0918
 92%|█████████▏| 3701/4007 [1:24:46<07:02,  1.38s/it]
epoch:2,train_loss:1.1063
 95%|█████████▍| 3801/4007 [1:27:03<04:44,  1.38s/it]
epoch:2,train_loss:1.1229
 97%|█████████▋| 3901/4007 [1:29:21<02:26,  1.38s/it]
epoch:2,train_loss:1.1005
100%|█████████▉| 4001/4007 [1:31:39<00:08,  1.39s/it]
epoch:2,train_loss:1.0934
100%|██████████| 4007/4007 [1:31:47<00:00,  1.34s/it]
epoch:2,train_loss:1.4589
  0%|          | 0/4007 [00:00<?, ?it/s]
Validation set: Weighted loss: 1.1338, Weighted Accuracy: 1667/64110 (12.44%)
highest_accuracy:12.44% 

  0%|          | 1/4007 [00:01<1:43:04,  1.54s/it]
epoch:3,train_loss:1.0973
  3%|▎         | 101/4007 [02:18<1:29:34,  1.38s/it]
epoch:3,train_loss:1.0986
  5%|▌         | 201/4007 [04:36<1:27:15,  1.38s/it]
epoch:3,train_loss:1.1224
  8%|▊         | 301/4007 [06:53<1:24:54,  1.37s/it]
epoch:3,train_loss:1.0831
 10%|█         | 401/4007 [09:11<1:22:43,  1.38s/it]
epoch:3,train_loss:1.0961
 13%|█▎        | 501/4007 [11:28<1:20:27,  1.38s/it]
epoch:3,train_loss:1.1184
 15%|█▍        | 601/4007 [13:46<1:18:06,  1.38s/it]
epoch:3,train_loss:1.1024
 17%|█▋        | 701/4007 [16:03<1:16:00,  1.38s/it]
epoch:3,train_loss:1.1045
 20%|█▉        | 801/4007 [18:21<1:13:59,  1.38s/it]
epoch:3,train_loss:1.1026
 22%|██▏       | 901/4007 [20:38<1:11:21,  1.38s/it]
epoch:3,train_loss:1.1005
 25%|██▍       | 1001/4007 [22:56<1:09:05,  1.38s/it]
epoch:3,train_loss:1.1009
 27%|██▋       | 1101/4007 [25:13<1:06:36,  1.38s/it]
epoch:3,train_loss:1.0999
 30%|██▉       | 1201/4007 [27:31<1:04:24,  1.38s/it]
epoch:3,train_loss:1.1145
 32%|███▏      | 1301/4007 [29:48<1:02:07,  1.38s/it]
epoch:3,train_loss:1.1002
 35%|███▍      | 1401/4007 [32:05<59:45,  1.38s/it]
epoch:3,train_loss:1.0994
 37%|███▋      | 1501/4007 [34:23<57:28,  1.38s/it]
epoch:3,train_loss:1.0837
 40%|███▉      | 1601/4007 [36:40<55:13,  1.38s/it]
epoch:3,train_loss:1.1088
 42%|████▏     | 1701/4007 [38:58<52:56,  1.38s/it]
epoch:3,train_loss:1.0967
 45%|████▍     | 1801/4007 [41:15<50:35,  1.38s/it]
epoch:3,train_loss:1.1111
 47%|████▋     | 1901/4007 [43:33<48:21,  1.38s/it]
epoch:3,train_loss:1.0960
 50%|████▉     | 2001/4007 [45:50<46:04,  1.38s/it]
epoch:3,train_loss:1.1008
 52%|█████▏    | 2101/4007 [48:07<43:51,  1.38s/it]
epoch:3,train_loss:1.1013
 55%|█████▍    | 2201/4007 [50:25<41:25,  1.38s/it]
epoch:3,train_loss:1.0997
 57%|█████▋    | 2301/4007 [52:42<39:13,  1.38s/it]
epoch:3,train_loss:1.1057
 60%|█████▉    | 2401/4007 [55:00<36:52,  1.38s/it]
epoch:3,train_loss:1.0945
 62%|██████▏   | 2501/4007 [57:17<34:35,  1.38s/it]
epoch:3,train_loss:1.1207
 65%|██████▍   | 2601/4007 [59:35<32:18,  1.38s/it]
epoch:3,train_loss:1.0971
 67%|██████▋   | 2701/4007 [1:01:53<29:59,  1.38s/it]
epoch:3,train_loss:1.0981
 70%|██████▉   | 2801/4007 [1:04:11<27:43,  1.38s/it]
epoch:3,train_loss:1.0984
 72%|███████▏  | 2901/4007 [1:06:28<25:26,  1.38s/it]
epoch:3,train_loss:1.1025
 75%|███████▍  | 3001/4007 [1:08:46<23:03,  1.38s/it]
epoch:3,train_loss:1.0971
 77%|███████▋  | 3101/4007 [1:11:03<20:47,  1.38s/it]
epoch:3,train_loss:1.0945
 80%|███████▉  | 3201/4007 [1:13:21<18:31,  1.38s/it]
epoch:3,train_loss:1.0951
 82%|████████▏ | 3301/4007 [1:15:38<16:12,  1.38s/it]
epoch:3,train_loss:1.0997
 85%|████████▍ | 3401/4007 [1:17:56<13:57,  1.38s/it]
epoch:3,train_loss:1.0974
 87%|████████▋ | 3501/4007 [1:20:13<11:37,  1.38s/it]
epoch:3,train_loss:1.0966
 90%|████████▉ | 3601/4007 [1:22:31<09:21,  1.38s/it]
epoch:3,train_loss:1.0986
 92%|█████████▏| 3701/4007 [1:24:49<07:02,  1.38s/it]
epoch:3,train_loss:1.0993
 95%|█████████▍| 3801/4007 [1:27:06<04:44,  1.38s/it]
epoch:3,train_loss:1.0972
 97%|█████████▋| 3901/4007 [1:29:24<02:26,  1.38s/it]
epoch:3,train_loss:1.0999
 99%|█████████▉| 3969/4007 [1:30:57<00:52,  1.37s/it]
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-6-b5be95c70f3f> in <module>()
    321     for epoch in range(EPOCH):
    322         #print(epoch+1)
--> 323         model = train(epoch)
    324         val_loss, accuracy = test()
    325 

<ipython-input-6-b5be95c70f3f> in train(epoch)
    233 
    234             loss = loss_function(outputs, y)
--> 235             loss.backward()
    236             optimizer.step()
    237 

/usr/local/lib/python3.6/dist-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    100                 products. Defaults to ``False``.
    101         """
--> 102         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    103 
    104     def register_hook(self, hook):

/usr/local/lib/python3.6/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     88     Variable._execution_engine.run_backward(
     89         tensors, grad_tensors, retain_graph, create_graph,
---> 90         allow_unreachable=True)  # allow_unreachable flag
     91 
     92 

KeyboardInterrupt: 

In [0]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()


Requirement already satisfied: gputil in /usr/local/lib/python3.6/dist-packages (1.3.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from gputil) (1.14.6)
Requirement already satisfied: psutil in /usr/local/lib/python3.6/dist-packages (5.4.8)
Requirement already satisfied: humanize in /usr/local/lib/python3.6/dist-packages (0.5.1)
Gen RAM Free: 11.1 GB  | Proc size: 3.0 GB
GPU RAM Free: 10721MB | Used: 720MB | Util   6% | Total 11441MB

In [0]:
!pip install pytorch_pretrained_bert
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm as tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

from sklearn.model_selection import train_test_split

import re
import os
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')

# from model import BERT_Classifier
from dataset import *
from collections import defaultdict
from sklearn.model_selection import KFold
import random


class BERT_Classifier(nn.Module):
    def __init__(self,target_size=3):
        super(BERT_Classifier, self).__init__()

        self.fc1 = nn.Linear(768, 768)
        self.fc1_bn = nn.BatchNorm1d(768)
        self.fc1_drop = nn.Dropout(p=0.3, inplace=False)
        self.fc2 = nn.Linear(768, target_size)

    def forward(self, last_encoder_layer):#, input_ids, input_mask):

        #last_encoder_layer, _ = self.bert_model(input_ids, token_type_ids=None, attention_mask=input_mask, output_all_encoded_layers=False)


        #print(last_encoder_layer.size())
        embedding = torch.sum(last_encoder_layer, 1)
        #print("embedding", embedding.size())

        fc1 = self.fc1_drop(F.relu(self.fc1_bn(self.fc1(embedding))))
        fc2 = self.fc2(fc1)

        return fc2





EMBEDDING_DIM = 512
HIDDEN_DIM = 256
max_seq_en = 50
max_seq_zh = 100
EPOCH=10

batch=32

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)


train_df = pd.read_csv("data/train.csv")

train_df.replace('unrelated', 0, inplace=True)
train_df.replace('agreed', 1, inplace=True)
train_df.replace('disagreed', 2, inplace=True)


X = pd.read_pickle("save/features.pickle")
print("X:", X.shape)
y = list(train_df["label"])


p = list(zip(X, y))
random.shuffle(p)
X, y = zip(*p)
X = np.array(X)
y = np.array(y)


# K-Fold Cross validation
fold_num = 5
kf = KFold(n_splits=fold_num)
kf.get_n_splits(X, y)

train_data_list = []
val_data_list = []
fold=1
for train_index, val_index in kf.split(X):
    X_train = X[train_index]
    X_val = X[val_index]
    y_train = y[train_index]
    y_val = y[val_index]


    print("{}/{} fold :".format(fold, fold_num))
    print("train length:{}, val length:{}".format(len(X_train), len(X_val)))


    c = Counter(y_train)
    class_weight = []
    for label, num in sorted(c.items()):
        print(label, num)
        class_weight.append(len(y_train)/(3*num))
    class_weight = torch.FloatTensor(class_weight).to(device)




    model = BERT_Classifier()
    model.to(device)
    loss_function = nn.CrossEntropyLoss()#weight=class_weight)
    weighted_loss_function = nn.CrossEntropyLoss(weight=class_weight)#weight=class_weight)

    #optimizer = optim.SGD(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=0.001)


    train_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
    val_dataset = torch.utils.data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

    #ミニバッチ内のクラス比を揃える.
    class_sample_count = np.array([len(np.where(y_train == t)[0]) for t in np.unique(y_train)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in y_train])
    samples_weight = torch.from_numpy(samples_weight)
    samples_weight = samples_weight.double()
    sampler = torch.utils.data.sampler.WeightedRandomSampler(samples_weight, len(samples_weight))

    train_loader = DataLoader(train_dataset, batch_size=batch, shuffle=False, sampler=sampler)#, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    def train(epoch):
        model.train()

        for batch_idx, sample_batch in enumerate(tqdm(train_loader)):
            inputs, y = sample_batch

            inputs = inputs.to(device)
            y = y.to(device)


            optimizer.zero_grad()
            outputs = model(inputs)

            loss = loss_function(outputs, y)
            loss.backward()
            optimizer.step()

        print("epoch:{},train_loss:{:.4f}".format(epoch+1 ,loss))
        #print("train data all :", (batch_idx+1)*batch)

        return model



    def test():
        with torch.no_grad():
            model.eval()
            test_loss = 0
            correct = 0

            for batch_idx, sample_batch in enumerate(val_loader):
                inputs, y = sample_batch
                inputs = inputs.to(device)
                y = y.to(device)


                output = model(inputs)
                # sum up batch loss
                test_loss += weighted_loss_function(output, y).item()
                # get the index of the max log-probability
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(y.view_as(pred)).sum().item()

            #test_loss /= len(val_loader.dataset)
            test_loss /= batch_idx+1
            #accuracy = 100. * correct / len(val_loader.dataset)

            accuracy = weighted_accuracy(pred, y)

            print('Validation set: Weighted loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'
                  .format(test_loss, correct, len(val_loader.dataset),
                          accuracy))

            return test_loss, accuracy


    def weighted_accuracy(pred, true):
        true = true.cpu().numpy()
        pred = pred.cpu().numpy()

        class_weight = [1/16, 1/15, 1/5]
        score = 0
        perfect_score = 0

        for p, t in zip(true, pred):
            if p == t:
                if t == 0:
                    score += 1/16
                    perfect_score += 1/16
                elif t == 1:
                    score += 1/15
                    perfect_score += 1/15
                elif t == 2:
                    score += 1/5
                    perfect_score += 1/5
            else:
                if t == 0:
                    perfect_score += 1/16
                elif t == 1:
                    perfect_score += 1/15
                elif t == 2:
                    perfect_score += 1/5
        #print("score:{}, ideal:{}".format(score, perfect_score))
        return 100 * score/perfect_score




    def save_model(model, val_accuracy, save_path="model/BERT/"):
        # if os.path.exists(path + "*.model"):
        #     os.remove(path + "*.model")
        name = "{}fold_mlp.model".format(fold)
        PATH = os.path.join(save_path, name)
        torch.save(model, PATH)

    lowest_loss = 1000000000
    highest_accuracy = 0
    for epoch in range(EPOCH):
        #print(epoch+1)
        model = train(epoch)
        val_loss, accuracy = test()

    #     if val_loss < lowest_loss:
    #         lowest_loss = val_loss
    #         save_model(model)

        if accuracy > highest_accuracy:
            #print("saving model...")
            highest_accuracy = accuracy
            #save_model(model, highest_accuracy)
        print("highest_accuracy:{:.2f}% \n".format(highest_accuracy))

    fold+=1